[RFC PATCH v3 3/5] mm: memcontrol: add interface for swap tier selection

Youngjun Park posted 5 patches 1 week, 1 day ago
[RFC PATCH v3 3/5] mm: memcontrol: add interface for swap tier selection
Posted by Youngjun Park 1 week, 1 day ago
This patch integrates the swap tier infrastructure with cgroup,
enabling the selection of specific swap devices per cgroup by
configuring allowed swap tiers.

The new `memory.swap.tiers` interface controls allowed swap tiers via a mask.
By default, the mask is set to include all tiers, allowing specific tiers to
be excluded or restored. Note that effective tiers are calculated separately
using a dedicated mask to respect the cgroup hierarchy. Consequently,
configured tiers may differ from effective ones, as they must be a subset
of the parent's.

Note that cgroups do not pin swap tiers. This is similar to the
`cpuset` controller, which does not prevent CPU hotplug. This
approach ensures flexibility by allowing tier configuration changes
regardless of cgroup usage.

Signed-off-by: Youngjun Park <youngjun.park@lge.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 27 ++++++++
 include/linux/memcontrol.h              |  3 +-
 mm/memcontrol.c                         | 85 +++++++++++++++++++++++
 mm/swap_state.c                         |  6 +-
 mm/swap_tier.c                          | 89 ++++++++++++++++++++++++-
 mm/swap_tier.h                          | 39 ++++++++++-
 mm/swapfile.c                           |  4 ++
 7 files changed, 246 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7f5b59d95fce..776a908ce1b9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1848,6 +1848,33 @@ The following nested keys are defined.
 	Swap usage hard limit.  If a cgroup's swap usage reaches this
 	limit, anonymous memory of the cgroup will not be swapped out.
 
+  memory.swap.tiers
+        A read-write nested-keyed file which exists on non-root
+        cgroups. The default is to enable all tiers.
+
+        This interface allows selecting which swap tiers a cgroup can
+        use for swapping out memory.
+
+        The effective tiers are inherited from the parent. Only tiers
+        effective in the parent can be effective in the child. However,
+        the child can explicitly disable tiers allowed by the parent.
+
+        When read, the file shows two lines:
+          - The first line shows the operation string that was
+            written to this file.
+          - The second line shows the effective operation after
+            merging with parent settings.
+
+        When writing, the format is:
+          (+/-)(TIER_NAME) (+/-)(TIER_NAME) ...
+
+        Valid tier names are those configured in
+        /sys/kernel/mm/swap/tiers.
+
+        Each tier can be prefixed with:
+          +    Enable this tier
+          -    Disable this tier
+
   memory.swap.events
 	A read-only flat-keyed file which exists on non-root cgroups.
 	The following entries are defined.  Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6c82c8f73e1..542bee1b5f60 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -283,7 +283,8 @@ struct mem_cgroup {
 	/* per-memcg mm_struct list */
 	struct lru_gen_mm_list mm_list;
 #endif
-
+	int tier_mask;
+	int tier_effective_mask;
 #ifdef CONFIG_MEMCG_V1
 	/* Legacy consumer-oriented counters */
 	struct page_counter kmem;		/* v1 only */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 007413a53b45..5fcf8ebe0ca8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,6 +68,7 @@
 #include <net/ip.h>
 #include "slab.h"
 #include "memcontrol-v1.h"
+#include "swap_tier.h"
 
 #include <linux/uaccess.h>
 
@@ -3691,6 +3692,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	lru_gen_exit_memcg(memcg);
 	memcg_wb_domain_exit(memcg);
+	swap_tiers_memcg_sync_mask(memcg);
 	__mem_cgroup_free(memcg);
 }
 
@@ -3792,6 +3794,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	WRITE_ONCE(memcg->zswap_writeback, true);
 #endif
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+	memcg->tier_mask = TIER_ALL_MASK;
+	swap_tiers_memcg_inherit_mask(memcg, parent);
+
 	if (parent) {
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 
@@ -5352,6 +5357,80 @@ static int swap_events_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int swap_tier_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	swap_tiers_mask_show(m, memcg->tier_mask);
+	swap_tiers_mask_show(m, memcg->tier_effective_mask);
+
+	return 0;
+}
+
+static ssize_t swap_tier_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	char *pos, *token;
+	int ret = 0;
+	int original_mask;
+
+	pos = strstrip(buf);
+
+	spin_lock(&swap_tier_lock);
+	if (!*pos) {
+		memcg->tier_mask = TIER_ALL_MASK;
+		goto sync;
+	}
+
+	original_mask = memcg->tier_mask;
+
+	while ((token = strsep(&pos, " \t\n")) != NULL) {
+		int mask;
+
+		if (!*token)
+			continue;
+
+		if (token[0] != '-' && token[0] != '+') {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		mask = swap_tiers_mask_lookup(token+1);
+		if (!mask) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/*
+		 * Only the configured mask is changed here; the sync step below
+		 * recomputes the effective mask as (parent's effective & configured).
+		 */
+		switch (token[0]) {
+		case '-':
+			memcg->tier_mask &= ~mask;
+			break;
+		case '+':
+			memcg->tier_mask |= mask;
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+
+		if (ret)
+			goto err;
+	}
+
+sync:
+	__swap_tiers_memcg_sync_mask(memcg);
+err:
+	if (ret)
+		memcg->tier_mask = original_mask;
+	spin_unlock(&swap_tier_lock);
+	return ret ? ret : nbytes;
+}
+
 static struct cftype swap_files[] = {
 	{
 		.name = "swap.current",
@@ -5384,6 +5463,12 @@ static struct cftype swap_files[] = {
 		.file_offset = offsetof(struct mem_cgroup, swap_events_file),
 		.seq_show = swap_events_show,
 	},
+	{
+		.name = "swap.tiers",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = swap_tier_show,
+		.write = swap_tier_write,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d46ca61d2e42..c0dcab74779d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -961,6 +961,8 @@ static ssize_t tiers_store(struct kobject *kobj,
 	char *p, *token, *name, *tmp;
 	int ret = 0;
 	short prio;
+	int mask = 0;
+
 	DEFINE_SWAP_TIER_SAVE_CTX(ctx);
 
 	tmp = kstrdup(buf, GFP_KERNEL);
@@ -978,7 +980,7 @@ static ssize_t tiers_store(struct kobject *kobj,
 			continue;
 
 		if (token[0] == '-') {
-			ret = swap_tiers_remove(token + 1);
+			ret = swap_tiers_remove(token + 1, &mask);
 		} else {
 
 			name = strsep(&token, ":");
@@ -997,7 +999,7 @@ static ssize_t tiers_store(struct kobject *kobj,
 			goto restore;
 	}
 
-	if (!swap_tiers_update()) {
+	if (!swap_tiers_update(mask)) {
 		ret = -EINVAL;
 		goto restore;
 	}
diff --git a/mm/swap_tier.c b/mm/swap_tier.c
index 7741214312c7..0e067ba545cb 100644
--- a/mm/swap_tier.c
+++ b/mm/swap_tier.c
@@ -232,7 +232,7 @@ int swap_tiers_add(const char *name, int prio)
 	return ret;
 }
 
-int swap_tiers_remove(const char *name)
+int swap_tiers_remove(const char *name, int *mask)
 {
 	int ret = 0;
 	struct swap_tier *tier;
@@ -255,6 +255,8 @@ int swap_tiers_remove(const char *name)
 		list_prev_entry(tier, list)->prio = DEF_SWAP_PRIO;
 
 	list_move(&tier->list, &swap_tier_inactive_list);
+	*mask |= TIER_MASK(tier);
+
 	return ret;
 }
 
@@ -351,7 +353,17 @@ void swap_tiers_assign_dev(struct swap_info_struct *swp)
 	swp->tier_mask = TIER_DEFAULT_MASK;
 }
 
-bool swap_tiers_update(void)
+static void swap_tier_memcg_propagate(int mask)
+{
+	struct mem_cgroup *child;
+
+	for_each_mem_cgroup_tree(child, root_mem_cgroup) {
+		child->tier_mask |= mask;
+		child->tier_effective_mask |= mask;
+	}
+}
+
+bool swap_tiers_update(int mask)
 {
 	struct swap_tier *tier;
 	struct swap_info_struct *swp;
@@ -379,6 +391,79 @@ bool swap_tiers_update(void)
 			break;
 		swap_tiers_assign_dev(swp);
 	}
+	/*
+	 * XXX: Unused tiers default to ON, disabled after next tier added.
+	 * Use removed tier mask to clear settings for removed/re-added tiers.
+	 * (Could hold tier refs, but better to keep cgroup config independent)
+	 */
+	if (mask)
+		swap_tier_memcg_propagate(mask);
 
 	return true;
 }
+
+void swap_tiers_mask_show(struct seq_file *m, int mask)
+{
+	struct swap_tier *tier;
+
+	spin_lock(&swap_tier_lock);
+	for_each_active_tier(tier) {
+		if (mask & TIER_MASK(tier))
+			seq_printf(m, "%s ", tier->name);
+	}
+	spin_unlock(&swap_tier_lock);
+	seq_puts(m, "\n");
+}
+
+int swap_tiers_mask_lookup(const char *name)
+{
+	struct swap_tier *tier;
+
+	lockdep_assert_held(&swap_tier_lock);
+
+	for_each_active_tier(tier) {
+		if (!strcmp(name, tier->name))
+			return TIER_MASK(tier);
+	}
+
+	return 0;
+}
+
+static void __swap_tier_memcg_inherit_mask(struct mem_cgroup *memcg,
+	struct mem_cgroup *parent)
+{
+	int effective_mask
+		= parent ? parent->tier_effective_mask : TIER_ALL_MASK;
+
+	memcg->tier_effective_mask
+		= effective_mask & memcg->tier_mask;
+}
+
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg,
+	struct mem_cgroup *parent)
+{
+	spin_lock(&swap_tier_lock);
+	__swap_tier_memcg_inherit_mask(memcg, parent);
+	spin_unlock(&swap_tier_lock);
+}
+
+void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *child;
+
+	lockdep_assert_held(&swap_tier_lock);
+
+	if (memcg == root_mem_cgroup)
+		return;
+
+	for_each_mem_cgroup_tree(child, memcg)
+		__swap_tier_memcg_inherit_mask(child, parent_mem_cgroup(child));
+}
+
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+	spin_lock(&swap_tier_lock);
+	memcg->tier_mask = TIER_ALL_MASK;
+	__swap_tiers_memcg_sync_mask(memcg);
+	spin_unlock(&swap_tier_lock);
+}
diff --git a/mm/swap_tier.h b/mm/swap_tier.h
index de81d540e3b5..9024c82c807a 100644
--- a/mm/swap_tier.h
+++ b/mm/swap_tier.h
@@ -31,19 +31,54 @@ struct swap_tier_save_ctx {
 #define TIER_DEFAULT_IDX	(31)
 #define TIER_DEFAULT_MASK	(1 << TIER_DEFAULT_IDX)
 
+#ifdef CONFIG_MEMCG
+static inline int folio_tier_effective_mask(struct folio *folio)
+{
+	struct mem_cgroup *memcg = folio_memcg(folio);
+
+	return memcg ? memcg->tier_effective_mask : TIER_ALL_MASK;
+}
+#else
+static inline int folio_tier_effective_mask(struct folio *folio)
+{
+	return TIER_ALL_MASK;
+}
+#endif
+
 /* Initialization and application */
 void swap_tiers_init(void);
 ssize_t swap_tiers_sysfs_show(char *buf);
 
 int swap_tiers_add(const char *name, int prio);
-int swap_tiers_remove(const char *name);
+int swap_tiers_remove(const char *name, int *mask);
 int swap_tiers_modify(const char *name, int prio);
 
 void swap_tiers_save(struct swap_tier_save_ctx ctx[]);
 void swap_tiers_restore(struct swap_tier_save_ctx ctx[]);
-bool swap_tiers_update(void);
+bool swap_tiers_update(int mask);
 
 /* Tier assignment */
 void swap_tiers_assign_dev(struct swap_info_struct *swp);
 
+/* Memcg related functions */
+void swap_tiers_mask_show(struct seq_file *m, int mask);
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg,
+	struct mem_cgroup *parent);
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+
+/* Mask and tier lookup */
+int swap_tiers_mask_lookup(const char *name);
+
+/**
+ * swap_tiers_mask_test - Check whether two tier masks overlap
+ * @tier_mask: The tier mask to check
+ * @mask: The mask to compare against
+ *
+ * Return: true if @tier_mask and @mask share at least one bit, false otherwise
+ */
+static inline bool swap_tiers_mask_test(int tier_mask, int mask)
+{
+	return tier_mask & mask;
+}
 #endif /* _SWAP_TIER_H */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f8ce021c5bd..e04811e10431 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1348,10 +1348,14 @@ static bool swap_alloc_fast(struct folio *folio)
 static void swap_alloc_slow(struct folio *folio)
 {
 	struct swap_info_struct *si, *next;
+	int mask = folio_tier_effective_mask(folio);
 
 	spin_lock(&swap_avail_lock);
 start_over:
 	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+		if (!swap_tiers_mask_test(si->tier_mask, mask))
+			continue;
+
 		/* Rotate the device and switch to a new cluster */
 		plist_requeue(&si->avail_list, &swap_avail_head);
 		spin_unlock(&swap_avail_lock);
-- 
2.34.1
Re: [RFC PATCH v3 3/5] mm: memcontrol: add interface for swap tier selection
Posted by Michal Koutný 5 days, 23 hours ago
Hi.

This is merely the API feedback.

(Feedback to the proposed form, I'm not sure whether/how this should
interact with memory.swap.max (formally cf io.weight).)

On Sat, Jan 31, 2026 at 09:54:52PM +0900, Youngjun Park <youngjun.park@lge.com> wrote:
> This patch integrates the swap tier infrastructure with cgroup,
> enabling the selection of specific swap devices per cgroup by
> configuring allowed swap tiers.
> 
> The new `memory.swap.tiers` interface controls allowed swap tiers via a mask.
> By default, the mask is set to include all tiers, allowing specific tiers to
> be excluded or restored. Note that effective tiers are calculated separately
> using a dedicated mask to respect the cgroup hierarchy. Consequently,
> configured tiers may differ from effective ones, as they must be a subset
> of the parent's.
> 
> Note that cgroups do not pin swap tiers. This is similar to the
> `cpuset` controller, which does not prevent CPU hotplug. This
> approach ensures flexibility by allowing tier configuration changes
> regardless of cgroup usage.
> 
> Signed-off-by: Youngjun Park <youngjun.park@lge.com>
> ---
>  Documentation/admin-guide/cgroup-v2.rst | 27 ++++++++
>  include/linux/memcontrol.h              |  3 +-
>  mm/memcontrol.c                         | 85 +++++++++++++++++++++++
>  mm/swap_state.c                         |  6 +-
>  mm/swap_tier.c                          | 89 ++++++++++++++++++++++++-
>  mm/swap_tier.h                          | 39 ++++++++++-
>  mm/swapfile.c                           |  4 ++
>  7 files changed, 246 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> index 7f5b59d95fce..776a908ce1b9 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1848,6 +1848,33 @@ The following nested keys are defined.
>  	Swap usage hard limit.  If a cgroup's swap usage reaches this
>  	limit, anonymous memory of the cgroup will not be swapped out.
>  
> +  memory.swap.tiers
> +        A read-write nested-keyed file which exists on non-root

"nested-keyed" format is something else in this document's lingo, see
e.g. io.stat.

I think you wanted to make this resemble cgroup.subtree_control (which
is fine).

> +        cgroups. The default is to enable all tiers.
> +
> +        This interface allows selecting which swap tiers a cgroup can
> +        use for swapping out memory.
> +
> +        The effective tiers are inherited from the parent. Only tiers
> +        effective in the parent can be effective in the child. However,
> +        the child can explicitly disable tiers allowed by the parent.
> +
> +        When read, the file shows two lines:
> +          - The first line shows the operation string that was
> +            written to this file.
> +          - The second line shows the effective operation after
> +            merging with parent settings.

The convention (in cpuset) is to split it in two files like
memory.swap.tiers and memory.swap.tiers.effective.

> +
> +        When writing, the format is:
> +          (+/-)(TIER_NAME) (+/-)(TIER_NAME) ...
> +
> +        Valid tier names are those configured in
> +        /sys/kernel/mm/swap/tiers.
> +
> +        Each tier can be prefixed with:
> +          +    Enable this tier
> +          -    Disable this tier
> +

I believe these are only superficial adjustments not affecting the
implementation.

Thanks,
Michal
Re: [RFC PATCH v3 3/5] mm: memcontrol: add interface for swap tier selection
Posted by YoungJun Park 5 days, 9 hours ago
On Tue, Feb 03, 2026 at 11:54:41AM +0100, Michal Koutný wrote:
> Hi.
> 
> This is merely the API feedback.
> 
> (Feedback to the proposed form, I'm not sure whether/how this should
> interact with memory.swap.max (formally cf io.weight).)
> 
> On Sat, Jan 31, 2026 at 09:54:52PM +0900, Youngjun Park <youngjun.park@lge.com> wrote:
> > This patch integrates the swap tier infrastructure with cgroup,
> > enabling the selection of specific swap devices per cgroup by
> > configuring allowed swap tiers.
> > 
> > The new `memory.swap.tiers` interface controls allowed swap tiers via a mask.
> > By default, the mask is set to include all tiers, allowing specific tiers to
> > be excluded or restored. Note that effective tiers are calculated separately
> > using a dedicated mask to respect the cgroup hierarchy. Consequently,
> > configured tiers may differ from effective ones, as they must be a subset
> > of the parent's.
> > 
> > Note that cgroups do not pin swap tiers. This is similar to the
> > `cpuset` controller, which does not prevent CPU hotplug. This
> > approach ensures flexibility by allowing tier configuration changes
> > regardless of cgroup usage.
> > 
> > Signed-off-by: Youngjun Park <youngjun.park@lge.com>
> > ---
> >  Documentation/admin-guide/cgroup-v2.rst | 27 ++++++++
> >  include/linux/memcontrol.h              |  3 +-
> >  mm/memcontrol.c                         | 85 +++++++++++++++++++++++
> >  mm/swap_state.c                         |  6 +-
> >  mm/swap_tier.c                          | 89 ++++++++++++++++++++++++-
> >  mm/swap_tier.h                          | 39 ++++++++++-
> >  mm/swapfile.c                           |  4 ++
> >  7 files changed, 246 insertions(+), 7 deletions(-)
> > 
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
> > index 7f5b59d95fce..776a908ce1b9 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -1848,6 +1848,33 @@ The following nested keys are defined.
> >  	Swap usage hard limit.  If a cgroup's swap usage reaches this
> >  	limit, anonymous memory of the cgroup will not be swapped out.
> >  
> > +  memory.swap.tiers
> > +        A read-write nested-keyed file which exists on non-root
> 
> "nested-keyed" format is something else in this document's lingo, see
> e.g. io.stat.
> 
> I think you wanted to make this resemble cgroup.subtree_control (which
> is fine).

You are right, I used the wrong expression. 
Simply describing it as a "file" seems sufficient.

> 
> > +        cgroups. The default is to enable all tiers.
> > +
> > +        This interface allows selecting which swap tiers a cgroup can
> > +        use for swapping out memory.
> > +
> > +        The effective tiers are inherited from the parent. Only tiers
> > +        effective in the parent can be effective in the child. However,
> > +        the child can explicitly disable tiers allowed by the parent.
> > +
> > +        When read, the file shows two lines:
> > +          - The first line shows the operation string that was
> > +            written to this file.
> > +          - The second line shows the effective operation after
> > +            merging with parent settings.
> 
> The convention (in cpuset) is to split it in two files like
> memory.swap.tiers and memory.swap.tiers.effective.

I will separate the two according to the convention. 
Thanks for correction.

> > +
> > +        When writing, the format is:
> > +          (+/-)(TIER_NAME) (+/-)(TIER_NAME) ...
> > +
> > +        Valid tier names are those configured in
> > +        /sys/kernel/mm/swap/tiers.
> > +
> > +        Each tier can be prefixed with:
> > +          +    Enable this tier
> > +          -    Disable this tier
> > +
> 
> I believe these are only superficial adjustments not affecting the
> implementation.
> 
> Thanks,
> Michal

Thanks for the review, Michal.
Youngjun Park