NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
it does not reduce the total memory usage of a memcg. In memcg direct
reclaim paths (e.g., charge-triggered or manual limit writes), where
demotion is allowed, this leads to "fake progress" where the reclaim
loop concludes it has satisfied the memory request without actually
reducing the cgroup's charge.
This could result in inefficient reclaim loops, CPU waste, moving all
pages to far-tier nodes, and potentially premature OOM kills when the
cgroup is under memory pressure but demotion is still possible.
Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
these memcg-specific reclaim paths. This ensures that reclaim
progress is only counted when memory is actually freed or swapped out.
Signed-off-by: Bing Jiao <bingjiao@google.com>
---
include/linux/swap.h | 1 +
mm/memcontrol-v1.c | 10 ++++++++--
mm/memcontrol.c | 16 +++++++++++-----
mm/vmscan.c | 1 +
4 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7a09df6977a5..e83897a6dc72 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
+#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 433bba9dfe71..3cb600e28e5b 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
int ret;
bool limits_invariant;
struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
+ unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
+
+ if (!memsw)
+ reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
do {
if (signal_pending(current)) {
@@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
+ reclaim_options, NULL)) {
ret = -EBUSY;
break;
}
@@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP, NULL))
+ reclaim_options, NULL))
nr_retries--;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 303ac622d22d..fcf1cd0da643 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
do {
unsigned long pflags;
@@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP,
+ reclaim_options,
NULL);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
@@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options, NULL);
+ gfp_mask, reclaim_options, NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
bool drained = false;
unsigned long high;
int err;
@@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
+ GFP_KERNEL, reclaim_options, NULL);
if (!reclaimed && !nr_retries--)
break;
@@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_NO_DEMOTION;
bool drained = false;
unsigned long max;
int err;
@@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
+ GFP_KERNEL, reclaim_options, NULL))
nr_reclaims--;
continue;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33287ba4a500..7a8617ba1748 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
--
2.53.0.851.ga537e3e6e9-goog
Hi Bing
On 3/18/26 4:37 AM, Bing Jiao wrote:
> NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> it does not reduce the total memory usage of a memcg. In memcg direct
> reclaim paths (e.g., charge-triggered or manual limit writes), where
> demotion is allowed, this leads to "fake progress" where the reclaim
> loop concludes it has satisfied the memory request without actually
> reducing the cgroup's charge.
>
> This could result in inefficient reclaim loops, CPU waste, moving all
> pages to far-tier nodes, and potentially premature OOM kills when the
> cgroup is under memory pressure but demotion is still possible.
>
> Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> these memcg-specific reclaim paths. This ensures that reclaim
> progress is only counted when memory is actually freed or swapped out.
Thanks for the patch. With this change, are we completely disabling
memory tiering in memcg?
>
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> ---
> include/linux/swap.h | 1 +
> mm/memcontrol-v1.c | 10 ++++++++--
> mm/memcontrol.c | 16 +++++++++++-----
> mm/vmscan.c | 1 +
> 4 files changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7a09df6977a5..e83897a6dc72 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
>
> #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
> #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
> +#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
> #define MIN_SWAPPINESS 0
> #define MAX_SWAPPINESS 200
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 433bba9dfe71..3cb600e28e5b 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> int ret;
> bool limits_invariant;
> struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
> + unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
> +
> + if (!memsw)
> + reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
>
> do {
> if (signal_pending(current)) {
> @@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> }
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
> + reclaim_options, NULL)) {
> ret = -EBUSY;
> break;
> }
> @@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> {
> int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> /* we call try-to-free pages for make this cgroup empty */
> lru_add_drain_all();
> @@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> return -EINTR;
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - MEMCG_RECLAIM_MAY_SWAP, NULL))
> + reclaim_options, NULL))
> nr_retries--;
> }
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 303ac622d22d..fcf1cd0da643 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> gfp_t gfp_mask)
> {
> unsigned long nr_reclaimed = 0;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> do {
> unsigned long pflags;
> @@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> psi_memstall_enter(&pflags);
> nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
> gfp_mask,
> - MEMCG_RECLAIM_MAY_SWAP,
> + reclaim_options,
> NULL);
> psi_memstall_leave(&pflags);
> } while ((memcg = parent_mem_cgroup(memcg)) &&
> @@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> /* Avoid the refill and flush of the older stock */
> batch = nr_pages;
>
> - reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
> if (!do_memsw_account() ||
> page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> if (page_counter_try_charge(&memcg->memory, batch, &counter))
> @@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
> psi_memstall_enter(&pflags);
> nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> - gfp_mask, reclaim_options, NULL);
> + gfp_mask, reclaim_options, NULL);
> psi_memstall_leave(&pflags);
>
> if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> @@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long high;
> int err;
> @@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> }
>
> reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
> + GFP_KERNEL, reclaim_options, NULL);
>
> if (!reclaimed && !nr_retries--)
> break;
> @@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long max;
> int err;
> @@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>
> if (nr_reclaims) {
> if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
> + GFP_KERNEL, reclaim_options, NULL))
> nr_reclaims--;
> continue;
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 33287ba4a500..7a8617ba1748 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> .may_unmap = 1,
> .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
> .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> + .no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
> };
> /*
> * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
Did you run any performance benchmarks with this patch?
This patch looks good to me. Feel free to add
Reviewed-by: Donet Tom <donettom@linux.ibm.com>
> --
> 2.53.0.851.ga537e3e6e9-goog
>
>
On Fri, Mar 20, 2026 at 06:47:14PM +0530, Donet Tom wrote:
> Hi Bing
>
> On 3/18/26 4:37 AM, Bing Jiao wrote:
> > NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> > it does not reduce the total memory usage of a memcg. In memcg direct
> > reclaim paths (e.g., charge-triggered or manual limit writes), where
> > demotion is allowed, this leads to "fake progress" where the reclaim
> > loop concludes it has satisfied the memory request without actually
> > reducing the cgroup's charge.
> >
> > This could result in inefficient reclaim loops, CPU waste, moving all
> > pages to far-tier nodes, and potentially premature OOM kills when the
> > cgroup is under memory pressure but demotion is still possible.
> >
> > Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> > these memcg-specific reclaim paths. This ensures that reclaim
> > progress is only counted when memory is actually freed or swapped out.
Hi, Donet,
Thank you for the feedback and reviewing the patch.
> Thanks for the patch. With this change, are we completely disabling memory
> tiering in memcg?
Yes, this change will completely disable demotion in memcg
direct reclaim, as demotion does not help to reduce memory usage.
>
> Did you run any performance benchmarks with this patch?
>
>
> This patch looks good to me. Feel free to add
>
> Reviewed-by: Donet Tom <donettom@linux.ibm.com>
Thanks again for the review!
Following a discussion with Yosry regarding demotion as an aging process,
I have decided to drop patches 2 and 3 from this series for now.
Additionally, Joshua Hahn's RFC ('Make memcg limits tier-aware') [1]
introduces a mechanism to scale memcg limits based on the ratio of
top-tier to total memory. This approach or similar approaches might
provide a more comprehensive way to resolve 'fake progress' in memcg
direct reclaim or establish a better framework for addressing such
issues in the future.
Hope you have a great weekend!
Best regards,
Bing
[1] https://lore.kernel.org/linux-mm/20260223223830.586018-1-joshua.hahnjy@gmail.com/
On Tue, Mar 17, 2026 at 4:07 PM Bing Jiao <bingjiao@google.com> wrote:
>
> NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> it does not reduce the total memory usage of a memcg. In memcg direct
> reclaim paths (e.g., charge-triggered or manual limit writes), where
> demotion is allowed, this leads to "fake progress" where the reclaim
> loop concludes it has satisfied the memory request without actually
> reducing the cgroup's charge.
>
> This could result in inefficient reclaim loops, CPU waste, moving all
> pages to far-tier nodes, and potentially premature OOM kills when the
> cgroup is under memory pressure but demotion is still possible.
>
> Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> these memcg-specific reclaim paths. This ensures that reclaim
> progress is only counted when memory is actually freed or swapped out.
See the discussion @
https://lore.kernel.org/linux-mm/20250909012141.1467-1-cuishw@inspur.com/
and the commits/threads it is referring to.
>
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> ---
> include/linux/swap.h | 1 +
> mm/memcontrol-v1.c | 10 ++++++++--
> mm/memcontrol.c | 16 +++++++++++-----
> mm/vmscan.c | 1 +
> 4 files changed, 21 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 7a09df6977a5..e83897a6dc72 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -356,6 +356,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
>
> #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
> #define MEMCG_RECLAIM_PROACTIVE (1 << 2)
> +#define MEMCG_RECLAIM_NO_DEMOTION (1 << 3)
> #define MIN_SWAPPINESS 0
> #define MAX_SWAPPINESS 200
>
> diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
> index 433bba9dfe71..3cb600e28e5b 100644
> --- a/mm/memcontrol-v1.c
> +++ b/mm/memcontrol-v1.c
> @@ -1466,6 +1466,10 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> int ret;
> bool limits_invariant;
> struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
> + unsigned int reclaim_options = MEMCG_RECLAIM_NO_DEMOTION;
> +
> + if (!memsw)
> + reclaim_options |= MEMCG_RECLAIM_MAY_SWAP;
>
> do {
> if (signal_pending(current)) {
> @@ -1500,7 +1504,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> }
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, NULL)) {
> + reclaim_options, NULL)) {
> ret = -EBUSY;
> break;
> }
> @@ -1520,6 +1524,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
> static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> {
> int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> /* we call try-to-free pages for make this cgroup empty */
> lru_add_drain_all();
> @@ -1532,7 +1538,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
> return -EINTR;
>
> if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
> - MEMCG_RECLAIM_MAY_SWAP, NULL))
> + reclaim_options, NULL))
> nr_retries--;
> }
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 303ac622d22d..fcf1cd0da643 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2287,6 +2287,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> gfp_t gfp_mask)
> {
> unsigned long nr_reclaimed = 0;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
>
> do {
> unsigned long pflags;
> @@ -2300,7 +2302,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
> psi_memstall_enter(&pflags);
> nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
> gfp_mask,
> - MEMCG_RECLAIM_MAY_SWAP,
> + reclaim_options,
> NULL);
> psi_memstall_leave(&pflags);
> } while ((memcg = parent_mem_cgroup(memcg)) &&
> @@ -2572,7 +2574,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> /* Avoid the refill and flush of the older stock */
> batch = nr_pages;
>
> - reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_NO_DEMOTION;
> if (!do_memsw_account() ||
> page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> if (page_counter_try_charge(&memcg->memory, batch, &counter))
> @@ -2610,7 +2612,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
>
> psi_memstall_enter(&pflags);
> nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
> - gfp_mask, reclaim_options, NULL);
> + gfp_mask, reclaim_options, NULL);
> psi_memstall_leave(&pflags);
>
> if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> @@ -4638,6 +4640,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_retries = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long high;
> int err;
> @@ -4669,7 +4673,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
> }
>
> reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL);
> + GFP_KERNEL, reclaim_options, NULL);
>
> if (!reclaimed && !nr_retries--)
> break;
> @@ -4690,6 +4694,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
> + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
> + MEMCG_RECLAIM_NO_DEMOTION;
> bool drained = false;
> unsigned long max;
> int err;
> @@ -4721,7 +4727,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
>
> if (nr_reclaims) {
> if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
> - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, NULL))
> + GFP_KERNEL, reclaim_options, NULL))
> nr_reclaims--;
> continue;
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 33287ba4a500..7a8617ba1748 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -6809,6 +6809,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> .may_unmap = 1,
> .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
> .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> + .no_demotion = !!(reclaim_options & MEMCG_RECLAIM_NO_DEMOTION),
> };
> /*
> * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> --
> 2.53.0.851.ga537e3e6e9-goog
>
On Tue, Mar 17, 2026 at 04:44:34PM -0700, Yosry Ahmed wrote:
> On Tue, Mar 17, 2026 at 4:07 PM Bing Jiao <bingjiao@google.com> wrote:
> >
> > NUMA demotion counts towards reclaim targets in shrink_folio_list(), but
> > it does not reduce the total memory usage of a memcg. In memcg direct
> > reclaim paths (e.g., charge-triggered or manual limit writes), where
> > demotion is allowed, this leads to "fake progress" where the reclaim
> > loop concludes it has satisfied the memory request without actually
> > reducing the cgroup's charge.
> >
> > This could result in inefficient reclaim loops, CPU waste, moving all
> > pages to far-tier nodes, and potentially premature OOM kills when the
> > cgroup is under memory pressure but demotion is still possible.
> >
> > Introduce the MEMCG_RECLAIM_NO_DEMOTION flag to disable demotion in
> > these memcg-specific reclaim paths. This ensures that reclaim
> > progress is only counted when memory is actually freed or swapped out.
>
> See the discussion @
> https://lore.kernel.org/linux-mm/20250909012141.1467-1-cuishw@inspur.com/
> and the commits/threads it is referring to.

Hi Yosry,

Thanks for pointing it out! I was unaware of the previous discussion
regarding demotion as an aging process. I will drop patches 2 and 3
from this series and resubmit patch 1 as a standalone fix by replying
to this thread.

Thanks,
Bing
In try_charge_memcg(), the 'reclaim_options' variable is initialized
once at the start of the function. However, the function contains a
retry loop. If reclaim_options were modified during an iteration
(e.g., by encountering a memsw limit), the modified state would
persist into subsequent retries.
This leads to incorrect reclaim behavior. Specifically,
MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
is reached. After reclamation attempts, a subsequent retry may
successfully charge memcg->memsw but fail on the memcg->memory charge.
In this case, swapping should be permitted, but the carried-over state
prevents it.
Fix by moving the initialization of 'reclaim_options' inside the
retry loop, ensuring a clean state for every reclaim attempt.
Fixes: 73b73bac90d9 ("mm: vmpressure: don't count proactive reclaim in vmpressure")
Signed-off-by: Bing Jiao <bingjiao@google.com>
Reviewed-by: Yosry Ahmed <yosry@kernel.org>
---
v2:
- Dropped other patches.
- Refined commit message to clarify the impact of the leak (Yosry).
- Added Reviewed-by tag.
mm/memcontrol.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f..303ac622d22d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2558,7 +2558,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct page_counter *counter;
unsigned long nr_reclaimed;
bool passed_oom = false;
- unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+ unsigned int reclaim_options;
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
@@ -2572,6 +2572,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
--
2.53.0.851.ga537e3e6e9-goog
On Wed, Mar 18, 2026 at 2:56 PM Bing Jiao <bingjiao@google.com> wrote:
>
> In try_charge_memcg(), the 'reclaim_options' variable is initialized
> once at the start of the function. However, the function contains a
> retry loop. If reclaim_options were modified during an iteration
> (e.g., by encountering a memsw limit), the modified state would
> persist into subsequent retries.
>
> This leads to incorrect reclaim behavior. Specifically,
> MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
> is reached. After reclamation attempts, a subsequent retry may
> successfully charge memcg->memsw but fail on the memcg->memory charge.
> In this case, swapping should be permitted, but the carried-over state
> prevents it.
>
> Fix by moving the initialization of 'reclaim_options' inside the
> retry loop, ensuring a clean state for every reclaim attempt.
>
> Fixes: 73b73bac90d9 ("mm: vmpressure: don't count proactive reclaim in vmpressure")
The Fixes tag is still wrong, see my previous reply.
In try_charge_memcg(), the 'reclaim_options' variable is initialized
once at the start of the function. However, the function contains a
retry loop. If reclaim_options were modified during an iteration
(e.g., by encountering a memsw limit), the modified state would
persist into subsequent retries.
This leads to incorrect reclaim behavior. Specifically,
MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
is reached. After reclamation attempts, a subsequent retry may
successfully charge memcg->memsw but fail on the memcg->memory charge.
In this case, swapping should be permitted, but the carried-over state
prevents it.
Fix by moving the initialization of 'reclaim_options' inside the
retry loop, ensuring a clean state for every reclaim attempt.
Fixes: 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()")
Signed-off-by: Bing Jiao <bingjiao@google.com>
Reviewed-by: Yosry Ahmed <yosry@kernel.org>
---
v3:
- Corrected the Fixes tag (Yosry).
mm/memcontrol.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f..303ac622d22d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2558,7 +2558,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct page_counter *counter;
unsigned long nr_reclaimed;
bool passed_oom = false;
- unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+ unsigned int reclaim_options;
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
@@ -2572,6 +2572,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
--
2.53.0.851.ga537e3e6e9-goog
On Wed 18-03-26 22:19:46, Bing Jiao wrote:
> In try_charge_memcg(), the 'reclaim_options' variable is initialized
> once at the start of the function. However, the function contains a
> retry loop. If reclaim_options were modified during an iteration
> (e.g., by encountering a memsw limit), the modified state would
> persist into subsequent retries.
>
> This leads to incorrect reclaim behavior. Specifically,
> MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
> > is reached. After reclaim attempts, a subsequent retry may
> successfully charge memcg->memsw but fail on the memcg->memory charge.
> In this case, swapping should be permitted, but the carried-over state
> prevents it.
Have you noticed this happening in practice or is this based on the code
reading?
> Fix by moving the initialization of 'reclaim_options' inside the
> retry loop, ensuring a clean state for every reclaim attempt.
>
> Fixes: 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()")
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> Reviewed-by: Yosry Ahmed <yosry@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks!
> ---
> v3:
> - Corrected the Fixes tag (Yosry).
>
> mm/memcontrol.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a47fb68dd65f..303ac622d22d 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2558,7 +2558,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> struct page_counter *counter;
> unsigned long nr_reclaimed;
> bool passed_oom = false;
> - unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> + unsigned int reclaim_options;
> bool drained = false;
> bool raised_max_event = false;
> unsigned long pflags;
> @@ -2572,6 +2572,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
> /* Avoid the refill and flush of the older stock */
> batch = nr_pages;
>
> + reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
> if (!do_memsw_account() ||
> page_counter_try_charge(&memcg->memsw, batch, &counter)) {
> if (page_counter_try_charge(&memcg->memory, batch, &counter))
> --
> 2.53.0.851.ga537e3e6e9-goog
--
Michal Hocko
SUSE Labs
On Thu, Mar 19, 2026 at 10:29:15AM +0100, Michal Hocko wrote: > On Wed 18-03-26 22:19:46, Bing Jiao wrote: > > In try_charge_memcg(), the 'reclaim_options' variable is initialized > > once at the start of the function. However, the function contains a > > retry loop. If reclaim_options were modified during an iteration > > (e.g., by encountering a memsw limit), the modified state would > > persist into subsequent retries. > > > > This leads to incorrect reclaim behavior. Specifically, > > MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit > > is reached. After reclaim attempts, a subsequent retry may > > successfully charge memcg->memsw but fail on the memcg->memory charge. > > In this case, swapping should be permitted, but the carried-over state > > prevents it. > > Have you noticed this happening in practice or is this based on the code > reading? Hi, Michal, thanks for the ack. This issue was identified during code reading, when I was analyzing the memsw limit behavior in try_charge_memcg(); specifically how retries are handled when demotion is disabled (the demotion patch itself was dropped). Best, Bing
On Fri 20-03-26 03:39:40, Bing Jiao wrote: > On Thu, Mar 19, 2026 at 10:29:15AM +0100, Michal Hocko wrote: > > On Wed 18-03-26 22:19:46, Bing Jiao wrote: > > > In try_charge_memcg(), the 'reclaim_options' variable is initialized > > > once at the start of the function. However, the function contains a > > > retry loop. If reclaim_options were modified during an iteration > > > (e.g., by encountering a memsw limit), the modified state would > > > persist into subsequent retries. > > > > > > This leads to incorrect reclaim behavior. Specifically, > > > MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit > > > is reached. After reclaim attempts, a subsequent retry may > > > successfully charge memcg->memsw but fail on the memcg->memory charge. > > > In this case, swapping should be permitted, but the carried-over state > > > prevents it. > > > > Have you noticed this happening in practice or is this based on the code > > reading? > > Hi, Michal, thanks for the ack. > > This issue was identified during code reading, when I was analyzing > the memsw limit behavior in try_charge_memcg(); specifically how > retries are handled when demotion is disabled (the demotion patch > itself was dropped). OK, that is always good to clarify in the changelog. -- Michal Hocko SUSE Labs
On Wed, Mar 18, 2026 at 10:19:46PM +0000, Bing Jiao wrote:
> In try_charge_memcg(), the 'reclaim_options' variable is initialized
> once at the start of the function. However, the function contains a
> retry loop. If reclaim_options were modified during an iteration
> (e.g., by encountering a memsw limit), the modified state would
> persist into subsequent retries.
>
> This leads to incorrect reclaim behavior. Specifically,
> MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
> is reached. After reclaim attempts, a subsequent retry may
> successfully charge memcg->memsw but fail on the memcg->memory charge.
> In this case, swapping should be permitted, but the carried-over state
> prevents it.
>
> Fix by moving the initialization of 'reclaim_options' inside the
> retry loop, ensuring a clean state for every reclaim attempt.
>
> Fixes: 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()")
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> Reviewed-by: Yosry Ahmed <yosry@kernel.org>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
On Wed, Mar 18, 2026 at 10:19:46PM +0000, Bing Jiao wrote:
> In try_charge_memcg(), the 'reclaim_options' variable is initialized
> once at the start of the function. However, the function contains a
> retry loop. If reclaim_options were modified during an iteration
> (e.g., by encountering a memsw limit), the modified state would
> persist into subsequent retries.
>
> This leads to incorrect reclaim behavior. Specifically,
> MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
> is reached. After reclaimation attemps, a subsequent retry may
> successfully charge memcg->memsw but fail on the memcg->memory charge.
> In this case, swapping should be permitted, but the carried-over state
> prevents it.
>
> Fix by moving the initialization of 'reclaim_options' inside the
> retry loop, ensuring a clean state for every reclaim attempt.
>
> Fixes: 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()")
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> Reviewed-by: Yosry Ahmed <yosry@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
In try_charge_memcg(), the 'reclaim_options' variable is initialized
once at the start of the function. However, the function contains a
retry loop. If reclaim_options were modified during an iteration
(e.g., by encountering a memsw limit), the modified state would
persist into subsequent retries.
This leads to incorrect reclaim behavior. Specifically,
MEMCG_RECLAIM_MAY_SWAP is cleared when the combined memcg->memsw limit
is reached. After reclaim attempts, a subsequent retry may
successfully charge memcg->memsw but fail on the memcg->memory charge.
In this case, swapping should be permitted, but the carried-over state
prevents it.
This issue was identified during code reading of try_charge_memcg()
while analyzing memsw limit behavior in tiered-memory systems;
no production failures have been reported yet.
Fix by moving the initialization of 'reclaim_options' inside the
retry loop, ensuring a clean state for every reclaim attempt.
Fixes: 6539cc053869 ("mm: memcontrol: fold mem_cgroup_do_charge()")
Signed-off-by: Bing Jiao <bingjiao@google.com>
Reviewed-by: Yosry Ahmed <yosry@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
---
v4:
- Clarify in the commit message that the issue was found via
code reading (Michal).
- Add ACKs (Michal and Johannes).
mm/memcontrol.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a47fb68dd65f..303ac622d22d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2558,7 +2558,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct page_counter *counter;
unsigned long nr_reclaimed;
bool passed_oom = false;
- unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
+ unsigned int reclaim_options;
bool drained = false;
bool raised_max_event = false;
unsigned long pflags;
@@ -2572,6 +2572,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
--
2.53.0.959.g497ff81fa9-goog
© 2016 - 2026 Red Hat, Inc.