Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
introduces the cpuset.mems_effective check and applies it to
can_demote(). However, it does not apply this check in
demote_folio_list().
This omission leads to situations where pages are demoted to nodes
that are explicitly excluded from the task's cpuset.mems.
The impact is two-fold:
1. Resource Isolation: This bug breaks resource isolation provided
by cpuset.mems. It allows pages to be demoted to nodes that are
dedicated to other tasks or are intended for hot-unplugging.
2. Performance Issue: In multi-tier systems, users use cpuset.mems
to bind tasks to tiers with different performance (e.g., avoiding
the slowest tiers for latency-sensitive data). This bug can
cause unexpected latency spikes if pages are demoted to the
farthest nodes.
To address the bug, implement a new function
mem_cgroup_filter_mems_allowed() to filter out nodes that are not
set in mems_effective, and update demote_folio_list() to utilize
this filtering logic. This ensures that demotion targets respect
the task's memory placement constraints.
Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
Signed-off-by: Bing Jiao <bingjiao@google.com>
---
include/linux/cpuset.h | 6 ++++++
include/linux/memcontrol.h | 7 +++++++
kernel/cgroup/cpuset.c | 18 ++++++++++++++++++
mm/memcontrol.c | 6 ++++++
mm/vmscan.c | 13 ++++++++++---
5 files changed, 47 insertions(+), 3 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a98d3330385c..0e94548e2d24 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -175,6 +175,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
}
extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
+extern void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask);
#else /* !CONFIG_CPUSETS */
static inline bool cpusets_enabled(void) { return false; }
@@ -305,6 +306,11 @@ static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
{
return true;
}
+
+static inline void cpuset_node_filter_allowed(struct cgroup *cgroup,
+ nodemask_t *mask)
+{
+}
#endif /* !CONFIG_CPUSETS */
#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fd400082313a..7cfd71c57caa 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1742,6 +1742,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
+void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask);
+
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
static inline bool memcg_is_dying(struct mem_cgroup *memcg)
@@ -1816,6 +1818,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
return true;
}
+static inline bool mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
+ nodemask_t *mask)
+{
+}
+
static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
{
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6e6eb09b8db6..2925bd6bca91 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4452,6 +4452,24 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
return allowed;
}
+void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+
+ if (!cpuset_v2())
+ return;
+
+ css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
+ if (!css)
+ return;
+
+ /* Follows the same assumption in cpuset_node_allowed() */
+ cs = container_of(css, struct cpuset, css);
+ nodes_and(*mask, *mask, cs->effective_mems);
+ css_put(css);
+}
+
/**
* cpuset_spread_node() - On which node to begin search for a page
* @rotor: round robin rotor
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75fc22a33b28..f414653867de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5602,6 +5602,12 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
}
+void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
+{
+ if (memcg)
+ cpuset_node_filter_allowed(memcg->css.cgroup, mask);
+}
+
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 453d654727c1..4d23c491e914 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1018,7 +1018,8 @@ static struct folio *alloc_demote_folio(struct folio *src,
* Folios which are not demoted are left on @demote_folios.
*/
static unsigned int demote_folio_list(struct list_head *demote_folios,
- struct pglist_data *pgdat)
+ struct pglist_data *pgdat,
+ struct mem_cgroup *memcg)
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
@@ -1032,7 +1033,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
*/
.gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
__GFP_NOMEMALLOC | GFP_NOWAIT,
- .nid = target_nid,
.nmask = &allowed_mask,
.reason = MR_DEMOTION,
};
@@ -1044,6 +1044,13 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
return 0;
node_get_allowed_targets(pgdat, &allowed_mask);
+ /* Filter the given nmask based on cpuset.mems.allowed */
+ mem_cgroup_filter_mems_allowed(memcg, &allowed_mask);
+ if (nodes_empty(allowed_mask))
+ return 0;
+ if (!node_isset(target_nid, allowed_mask))
+ target_nid = node_random(&allowed_mask);
+ mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */
migrate_pages(demote_folios, alloc_demote_folio, NULL,
@@ -1565,7 +1572,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
nr_reclaimed += nr_demoted;
stat->nr_demoted += nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
--
2.52.0.351.gbe84eed79e-goog
On 2025/12/22 7:36, Bing Jiao wrote:
> Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
> introduces the cpuset.mems_effective check and applies it to
> can_demote(). However, it does not apply this check in
> demote_folio_list().
>
> This omission leads to situations where pages are demoted to nodes
> that are explicitly excluded from the task's cpuset.mems.
> The impact is two-fold:
>
> 1. Resource Isolation: This bug breaks resource isolation provided
> by cpuset.mems. It allows pages to be demoted to nodes that are
> dedicated to other tasks or are intended for hot-unplugging.
>
> 2. Performance Issue: In multi-tier systems, users use cpuset.mems
> to bind tasks to different performed-far tiers (e.g., avoiding
> the slowest tiers for latency-sensitive data). This bug can
> cause unexpected latency spikes if pages are demoted to the
> farthest nodes.
>
> To address the bug, implement a new function
> mem_cgroup_filter_mems_allowed() to filter out nodes that are not
> set in mems_effective, and update demote_folio_list() to utilize
> this filtering logic. This ensures that demotions target respect
> task's memory placement constraints.
>
> Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
> Signed-off-by: Bing Jiao <bingjiao@google.com>
> ---
> include/linux/cpuset.h | 6 ++++++
> include/linux/memcontrol.h | 7 +++++++
> kernel/cgroup/cpuset.c | 18 ++++++++++++++++++
> mm/memcontrol.c | 6 ++++++
> mm/vmscan.c | 13 ++++++++++---
> 5 files changed, 47 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a98d3330385c..0e94548e2d24 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -175,6 +175,7 @@ static inline void set_mems_allowed(nodemask_t nodemask)
> }
>
> extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid);
> +extern void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask);
> #else /* !CONFIG_CPUSETS */
>
> static inline bool cpusets_enabled(void) { return false; }
> @@ -305,6 +306,11 @@ static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> {
> return true;
> }
> +
> +static inline void cpuset_node_filter_allowed(struct cgroup *cgroup,
> + nodemask_t *mask)
> +{
> +}
> #endif /* !CONFIG_CPUSETS */
>
> #endif /* _LINUX_CPUSET_H */
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index fd400082313a..7cfd71c57caa 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1742,6 +1742,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg,
>
> bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid);
>
> +void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask);
> +
> void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg);
>
> static inline bool memcg_is_dying(struct mem_cgroup *memcg)
> @@ -1816,6 +1818,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
> return true;
> }
>
> +static inline bool mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
> + nodemask_t *mask)
> +{
> +}
> +
> static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
> {
> }
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 6e6eb09b8db6..2925bd6bca91 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -4452,6 +4452,24 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
> return allowed;
> }
>
> +void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask)
> +{
> + struct cgroup_subsys_state *css;
> + struct cpuset *cs;
> +
> + if (!cpuset_v2())
> + return;
> +
> + css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
> + if (!css)
> + return;
> +
> + /* Follows the same assumption in cpuset_node_allowed() */
> + cs = container_of(css, struct cpuset, css);
> + nodes_and(*mask, *mask, cs->effective_mems);
> + css_put(css);
> +}
> +
The functions cpuset_node_filter_allowed and cpuset_node_allowed are similar. We should create a
helper function to obtain cs->effective_mems, which can then be used by both
cpuset_node_filter_allowed and cpuset_node_allowed.
For example:
nodemask_t *cpuset_get_mem_allowed(struct cgroup *cgroup)
{
}
bool cpuset_node_allowed(struct cgroup *cgroup, int nid)
{
e_mask = cpuset_get_mem_allowed(cgroup);
return node_isset(nid, *e_mask);
}
void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask)
{
e_mask = cpuset_get_mem_allowed(cgroup);
nodes_and(*mask, *mask, *e_mask);
}
As I said previously, I do not think we should distinguish between cgroup v1 and v2 here. This should be a
common function; at least based on its name, it should not be solely for v2.
> /**
> * cpuset_spread_node() - On which node to begin search for a page
> * @rotor: round robin rotor
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 75fc22a33b28..f414653867de 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5602,6 +5602,12 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid)
> return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true;
> }
>
> +void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask)
> +{
> + if (memcg)
> + cpuset_node_filter_allowed(memcg->css.cgroup, mask);
> +}
> +
> void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
> {
> if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 453d654727c1..4d23c491e914 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1018,7 +1018,8 @@ static struct folio *alloc_demote_folio(struct folio *src,
> * Folios which are not demoted are left on @demote_folios.
> */
> static unsigned int demote_folio_list(struct list_head *demote_folios,
> - struct pglist_data *pgdat)
> + struct pglist_data *pgdat,
> + struct mem_cgroup *memcg)
> {
> int target_nid = next_demotion_node(pgdat->node_id);
> unsigned int nr_succeeded;
> @@ -1032,7 +1033,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
> */
> .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
> __GFP_NOMEMALLOC | GFP_NOWAIT,
> - .nid = target_nid,
> .nmask = &allowed_mask,
> .reason = MR_DEMOTION,
> };
> @@ -1044,6 +1044,13 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
> return 0;
>
> node_get_allowed_targets(pgdat, &allowed_mask);
> + /* Filter the given nmask based on cpuset.mems.allowed */
> + mem_cgroup_filter_mems_allowed(memcg, &allowed_mask);
> + if (nodes_empty(allowed_mask))
> + return 0;
> + if (!node_isset(target_nid, allowed_mask))
> + target_nid = node_random(&allowed_mask);
> + mtc.nid = target_nid;
>
> /* Demotion ignores all cpuset and mempolicy settings */
> migrate_pages(demote_folios, alloc_demote_folio, NULL,
> @@ -1565,7 +1572,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> /* 'folio_list' is always empty here */
>
> /* Migrate folios selected for demotion */
> - nr_demoted = demote_folio_list(&demote_folios, pgdat);
> + nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg);
> nr_reclaimed += nr_demoted;
> stat->nr_demoted += nr_demoted;
> /* Folios that could not be demoted are still in @demote_folios */
--
Best regards,
Ridong
Hi Bing,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on tj-cgroup/for-next linus/master v6.19-rc2 next-20251219]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Bing-Jiao/mm-vmscan-respect-mems_effective-in-demote_folio_list/20251222-074143
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251221233635.3761887-2-bingjiao%40google.com
patch subject: [PATCH v2 1/2] mm/vmscan: respect mems_effective in demote_folio_list()
config: arm-allnoconfig (https://download.01.org/0day-ci/archive/20251223/202512230553.LuiUveL3-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 185f5fd5ce4c65116ca8cf6df467a682ef090499)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251223/202512230553.LuiUveL3-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512230553.LuiUveL3-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from kernel/sched/rq-offsets.c:5:
In file included from kernel/sched/sched.h:61:
In file included from include/linux/syscalls_api.h:1:
In file included from include/linux/syscalls.h:96:
In file included from include/trace/syscall.h:7:
In file included from include/linux/trace_events.h:10:
In file included from include/linux/perf_event.h:53:
In file included from include/linux/security.h:35:
In file included from include/linux/bpf.h:32:
>> include/linux/memcontrol.h:1824:1: warning: non-void function does not return a value [-Wreturn-type]
1824 | }
| ^
1 warning generated.
--
In file included from arch/arm/kernel/signal.c:12:
In file included from include/linux/resume_user_mode.h:8:
>> include/linux/memcontrol.h:1824:1: warning: non-void function does not return a value [-Wreturn-type]
1824 | }
| ^
arch/arm/kernel/signal.c:143:15: warning: variable 'aux' set but not used [-Wunused-but-set-variable]
143 | char __user *aux;
| ^
2 warnings generated.
--
In file included from kernel/sched/rq-offsets.c:5:
In file included from kernel/sched/sched.h:61:
In file included from include/linux/syscalls_api.h:1:
In file included from include/linux/syscalls.h:96:
In file included from include/trace/syscall.h:7:
In file included from include/linux/trace_events.h:10:
In file included from include/linux/perf_event.h:53:
In file included from include/linux/security.h:35:
In file included from include/linux/bpf.h:32:
>> include/linux/memcontrol.h:1824:1: warning: non-void function does not return a value [-Wreturn-type]
1824 | }
| ^
1 warning generated.
vim +1824 include/linux/memcontrol.h
1820
1821 static inline bool mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
1822 nodemask_t *mask)
1823 {
> 1824 }
1825
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Bing,
kernel test robot noticed the following build warnings:
[auto build test WARNING on akpm-mm/mm-everything]
[also build test WARNING on tj-cgroup/for-next linus/master v6.19-rc2 next-20251219]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Bing-Jiao/mm-vmscan-respect-mems_effective-in-demote_folio_list/20251222-074143
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251221233635.3761887-2-bingjiao%40google.com
patch subject: [PATCH v2 1/2] mm/vmscan: respect mems_effective in demote_folio_list()
config: alpha-allnoconfig (https://download.01.org/0day-ci/archive/20251223/202512230655.QvO6dmjt-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 15.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251223/202512230655.QvO6dmjt-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512230655.QvO6dmjt-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from include/linux/bpf.h:32,
from include/linux/security.h:35,
from include/linux/perf_event.h:53,
from include/linux/trace_events.h:10,
from include/trace/syscall.h:7,
from include/linux/syscalls.h:96,
from include/linux/syscalls_api.h:1,
from kernel/sched/sched.h:61,
from kernel/sched/rq-offsets.c:5:
include/linux/memcontrol.h: In function 'mem_cgroup_filter_mems_allowed':
>> include/linux/memcontrol.h:1824:1: warning: no return statement in function returning non-void [-Wreturn-type]
1824 | }
| ^
--
In file included from include/linux/bpf.h:32,
from include/linux/security.h:35,
from include/linux/perf_event.h:53,
from include/linux/trace_events.h:10,
from include/trace/syscall.h:7,
from include/linux/syscalls.h:96,
from include/linux/syscalls_api.h:1,
from kernel/sched/sched.h:61,
from kernel/sched/rq-offsets.c:5:
include/linux/memcontrol.h: In function 'mem_cgroup_filter_mems_allowed':
>> include/linux/memcontrol.h:1824:1: warning: no return statement in function returning non-void [-Wreturn-type]
1824 | }
| ^
vim +1824 include/linux/memcontrol.h
1820
1821 static inline bool mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
1822 nodemask_t *mask)
1823 {
> 1824 }
1825
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Red Hat, Inc.