mm: add memory.compact_unevictable_allowed cgroup attribute

[PATCH] mm: add memory.compact_unevictable_allowed cgroup attribute

Posted by Daniil Tatianin 2 weeks, 6 days ago

The current global sysctl compact_unevictable_allowed is too coarse.
In environments with mixed workloads, we may want to protect specific
important cgroups from compaction to ensure their stability and
responsiveness, while allowing compaction for others.

This patch introduces a per-memcg compact_unevictable_allowed attribute.
This allows granular control over whether unevictable pages in a specific
cgroup can be compacted. The global sysctl still takes precedence if set
to disallow compaction, but this new setting allows opting out specific
cgroups.

This also adds a new ISOLATE_UNEVICTABLE_CHECK_MEMCG flag to
isolate_migratepages_block to preserve the old behavior for the
ISOLATE_UNEVICTABLE flag unconditionally used by
isolate_migratepages_range.

Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru>
---
 include/linux/memcontrol.h | 19 ++++++++++++++++++
 include/linux/mmzone.h     |  5 +++++
 mm/compaction.c            | 21 +++++++++++++++++---
 mm/memcontrol.c            | 40 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 70b685a85bf4..13b7ef6cf511 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -227,6 +227,12 @@ struct mem_cgroup {
 	 */
 	bool oom_group;
 
+	/*
+	 * Is compaction allowed to take unevictable pages accounted to
+	 * this cgroup?
+	 */
+	bool compact_unevictable_allowed;
+
 	int swappiness;
 
 	/* memory.events and memory.events.local */
@@ -640,6 +646,14 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
 		page_counter_read(&memcg->memory);
 }
 
+static inline bool mem_cgroup_compact_unevictable_allowed(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled() || !memcg)
+		return true;
+
+	return READ_ONCE(memcg->compact_unevictable_allowed);
+}
+
 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);
 
 /**
@@ -1092,6 +1106,11 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
+static inline bool mem_cgroup_compact_unevictable_allowed(struct mem_cgroup *memcg)
+{
+	return true;
+}
+
 static inline void memcg_memory_event(struct mem_cgroup *memcg,
 				      enum memcg_memory_event event)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3e51190a55e4..dadc9b66efa1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -701,6 +701,11 @@ struct lruvec {
 #define ISOLATE_ASYNC_MIGRATE	((__force isolate_mode_t)0x4)
 /* Isolate unevictable pages */
 #define ISOLATE_UNEVICTABLE	((__force isolate_mode_t)0x8)
+/*
+ * Isolate unevictable pages, but honor the page's cgroup settings if it
+ * explicitly disallows unevictable isolation.
+ */
+#define ISOLATE_UNEVICTABLE_CHECK_MEMCG ((__force isolate_mode_t)0x10)
 
 /* LRU Isolation modes. */
 typedef unsigned __bitwise isolate_mode_t;
diff --git a/mm/compaction.c b/mm/compaction.c
index 1e8f8eca318c..0dbb81aa5d2e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1098,8 +1098,22 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		is_unevictable = folio_test_unevictable(folio);
 
 		/* Compaction might skip unevictable pages but CMA takes them */
-		if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable)
-			goto isolate_fail_put;
+		if (is_unevictable) {
+			if (mode & ISOLATE_UNEVICTABLE_CHECK_MEMCG) {
+				struct mem_cgroup *memcg;
+
+				rcu_read_lock();
+				memcg = folio_memcg_check(folio);
+
+				if (!mem_cgroup_compact_unevictable_allowed(memcg)) {
+					rcu_read_unlock();
+					goto isolate_fail_put;
+				}
+
+				rcu_read_unlock();
+			} else if (!(mode & ISOLATE_UNEVICTABLE))
+				goto isolate_fail_put;
+		}
 
 		/*
 		 * To minimise LRU disruption, the caller can indicate with
@@ -2049,7 +2063,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 	unsigned long low_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
-		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
+		(sysctl_compact_unevictable_allowed ?
+			ISOLATE_UNEVICTABLE_CHECK_MEMCG : 0) |
 		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
 	bool fast_find_block;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 772bac21d155..bd0230d93dd8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3839,6 +3839,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	WRITE_ONCE(memcg->zswap_writeback, true);
 #endif
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+	WRITE_ONCE(memcg->compact_unevictable_allowed,
+		mem_cgroup_compact_unevictable_allowed(parent));
 	if (parent) {
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 
@@ -4608,6 +4610,37 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int memory_compact_unevictable_allowed_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", READ_ONCE(memcg->compact_unevictable_allowed));
+
+	return 0;
+}
+
+static ssize_t memory_compact_unevictable_allowed_write(
+	struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, allowed;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &allowed);
+	if (ret)
+		return ret;
+
+	if (allowed != 0 && allowed != 1)
+		return -EINVAL;
+
+	WRITE_ONCE(memcg->compact_unevictable_allowed, allowed);
+
+	return nbytes;
+}
+
 static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 			      size_t nbytes, loff_t off)
 {
@@ -4692,6 +4725,13 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+	{
+		.name = "compact_unevictable_allowed",
+		/* For root use /proc/sys/vm/compact_unevictable_allowed */
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_compact_unevictable_allowed_show,
+		.write = memory_compact_unevictable_allowed_write,
+	},
 	{ }	/* terminate */
 };
 
-- 
2.34.1

Re: [PATCH] mm: add memory.compact_unevictable_allowed cgroup attribute

Posted by Andrew Morton 2 weeks, 6 days ago

On Tue, 17 Mar 2026 13:00:58 +0300 Daniil Tatianin <d-tatianin@yandex-team.ru> wrote:

> The current global sysctl compact_unevictable_allowed is too coarse.
> In environments with mixed workloads, we may want to protect specific
> important cgroups from compaction to ensure their stability and
> responsiveness, while allowing compaction for others.
> 
> This patch introduces a per-memcg compact_unevictable_allowed attribute.
> This allows granular control over whether unevictable pages in a specific
> cgroup can be compacted. The global sysctl still takes precedence if set
> to disallow compaction, but this new setting allows opting out specific
> cgroups.
> 
> This also adds a new ISOLATE_UNEVICTABLE_CHECK_MEMCG flag to
> isolate_migratepages_block to preserve the old behavior for the
> ISOLATE_UNEVICTABLE flag unconditionally used by
> isolage_migratepages_range.

AI review asked questions:
	https://sashiko.dev/#/patchset/20260317100058.2316997-1-d-tatianin@yandex-team.ru

Re: [PATCH] mm: add memory.compact_unevictable_allowed cgroup attribute

Posted by Daniil Tatianin 2 weeks, 6 days ago

On 3/17/26 10:17 PM, Andrew Morton wrote:
> On Tue, 17 Mar 2026 13:00:58 +0300 Daniil Tatianin <d-tatianin@yandex-team.ru> wrote:
>
>> The current global sysctl compact_unevictable_allowed is too coarse.
>> In environments with mixed workloads, we may want to protect specific
>> important cgroups from compaction to ensure their stability and
>> responsiveness, while allowing compaction for others.
>>
>> This patch introduces a per-memcg compact_unevictable_allowed attribute.
>> This allows granular control over whether unevictable pages in a specific
>> cgroup can be compacted. The global sysctl still takes precedence if set
>> to disallow compaction, but this new setting allows opting out specific
>> cgroups.
>>
>> This also adds a new ISOLATE_UNEVICTABLE_CHECK_MEMCG flag to
>> isolate_migratepages_block to preserve the old behavior for the
>> ISOLATE_UNEVICTABLE flag unconditionally used by
>> isolage_migratepages_range.
> AI review asked questions:
> 	https://sashiko.dev/#/patchset/20260317100058.2316997-1-d-tatianin@yandex-team.ru

> Should this dynamically walk up the ancestor chain during evaluation to
> ensure it returns false if any ancestor has disallowed compaction?

I think ultimately it's up to cgroup maintainers whether the code should 
do that, but as far as I understand the whole point of cgroups is that a 
child can override the settings of its parent. Moreover, this property 
doesn't have CFTYPE_NS_DELEGATABLE set, so a child cgroup cannot just 
toggle it at will.

Re: [PATCH] mm: add memory.compact_unevictable_allowed cgroup attribute

Posted by Michal Hocko 2 weeks, 5 days ago

On Tue 17-03-26 23:17:28, Daniil Tatianin wrote:
> 
> On 3/17/26 10:17 PM, Andrew Morton wrote:
> > On Tue, 17 Mar 2026 13:00:58 +0300 Daniil Tatianin <d-tatianin@yandex-team.ru> wrote:
> > 
> > > The current global sysctl compact_unevictable_allowed is too coarse.
> > > In environments with mixed workloads, we may want to protect specific
> > > important cgroups from compaction to ensure their stability and
> > > responsiveness, while allowing compaction for others.
> > > 
> > > This patch introduces a per-memcg compact_unevictable_allowed attribute.
> > > This allows granular control over whether unevictable pages in a specific
> > > cgroup can be compacted. The global sysctl still takes precedence if set
> > > to disallow compaction, but this new setting allows opting out specific
> > > cgroups.
> > > 
> > > This also adds a new ISOLATE_UNEVICTABLE_CHECK_MEMCG flag to
> > > isolate_migratepages_block to preserve the old behavior for the
> > > ISOLATE_UNEVICTABLE flag unconditionally used by
> > > isolage_migratepages_range.
> > AI review asked questions:
> > 	https://sashiko.dev/#/patchset/20260317100058.2316997-1-d-tatianin@yandex-team.ru
> 
> > Should this dynamically walk up the ancestor chain during evaluation to
> > ensure it returns false if any ancestor has disallowed compaction?
> 
> I think ultimately it's up to cgroup maintainers whether the code should do
> that, but as far as I understand the whole point of cgroups is that a child
> can override the settings of its parent. Moreover, this property doesn't
> have CFTYPE_NS_DELEGATABLE set, so a child cgroup cannot just toggle it at
> will.

In general, any attribute should have proper hierarchical semantics. I
am not sure what those should be in this case. What is desirable for a
child cgroup can become fragmentation pressure on others.

I think it would be really important to explain those use cases of
mixed workloads more thoroughly. Is the memcg even a suitable level of
abstraction for this tunable? Doesn't this belong to tasks, if anything?
-- 
Michal Hocko
SUSE Labs

Re: [PATCH] mm: add memory.compact_unevictable_allowed cgroup attribute

Posted by Daniil Tatianin 2 weeks, 5 days ago

On 3/18/26 11:25 AM, Michal Hocko wrote:
> On Tue 17-03-26 23:17:28, Daniil Tatianin wrote:
>> On 3/17/26 10:17 PM, Andrew Morton wrote:
>>> On Tue, 17 Mar 2026 13:00:58 +0300 Daniil Tatianin<d-tatianin@yandex-team.ru> wrote:
>>>
>>>> The current global sysctl compact_unevictable_allowed is too coarse.
>>>> In environments with mixed workloads, we may want to protect specific
>>>> important cgroups from compaction to ensure their stability and
>>>> responsiveness, while allowing compaction for others.
>>>>
>>>> This patch introduces a per-memcg compact_unevictable_allowed attribute.
>>>> This allows granular control over whether unevictable pages in a specific
>>>> cgroup can be compacted. The global sysctl still takes precedence if set
>>>> to disallow compaction, but this new setting allows opting out specific
>>>> cgroups.
>>>>
>>>> This also adds a new ISOLATE_UNEVICTABLE_CHECK_MEMCG flag to
>>>> isolate_migratepages_block to preserve the old behavior for the
>>>> ISOLATE_UNEVICTABLE flag unconditionally used by
>>>> isolage_migratepages_range.
>>> AI review asked questions:
>>> 	https://sashiko.dev/#/patchset/20260317100058.2316997-1-d-tatianin@yandex-team.ru
>>> Should this dynamically walk up the ancestor chain during evaluation to
>>> ensure it returns false if any ancestor has disallowed compaction?
>> I think ultimately it's up to cgroup maintainers whether the code should do
>> that, but as far as I understand the whole point of cgroups is that a child
>> can override the settings of its parent. Moreover, this property doesn't
>> have CFTYPE_NS_DELEGATABLE set, so a child cgroup cannot just toggle it at
>> will.
> In general any attributes should have proper hieararchical semantic. I
> am not sure what that should be in this case. What is a desire in a
> child cgroup can become fragmentation pressure to others.

>
> I think it would be really important to explain more thoroughly about
> those usecases of mixed workloads.

I think there are many examples of a system where one process is more
important than others. For example, any sort of healthcheck or even the
ssh daemon: these may become unresponsive during heavy compaction due to
thousands of TLB invalidate IPIs or page faulting on pages that are
being compacted. Another example is a VM that is responsible for routing
the traffic of all other VMs or even the entire cluster. You really want
to prioritize its responsiveness, while still allowing compaction of
memory for the rest of the system, for less important VMs or services
etc.

> Is the memcg even a suitable level of
> abstraction for this tunable?

In my opinion it is, since it is relatively common to put all related
tasks into one cgroup with preset memory limits etc.

> Doesn't this belong to tasks if anything?

I think it would be very difficult to implement properly as a per-task
attribute, since compaction works at the folio level. While folios have
a pointer to the memcg that owns them, they may be mapped by multiple
processes in the case of shared memory. We would have to find all the
address spaces mapping this folio and then check the property on every
one of them, and they may be set to different values. This may be
problematic performance-wise to do for every physical page, and it also
introduces unclear semantics if different address spaces mapping the
same page have different opinions.

(resend because of html formatting in the previous email)