This patch integrates the swap tier infrastructure with cgroup,
enabling the selection of specific swap devices per cgroup by
configuring allowed swap tiers.
The new `memory.swap.tiers` interface controls allowed swap tiers via a mask.
By default, the mask is set to include all tiers, allowing specific tiers to
be excluded or restored. Note that effective tiers are calculated separately
using a dedicated mask to respect the cgroup hierarchy. Consequently,
configured tiers may differ from effective ones, as they must be a subset
of the parent's.
Note that cgroups do not pin swap tiers. This is similar to the
`cpuset` controller, which does not prevent CPU hotplug. This
approach ensures flexibility by allowing tier configuration changes
regardless of cgroup usage.
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
---
Documentation/admin-guide/cgroup-v2.rst | 27 ++++++++
include/linux/memcontrol.h | 3 +-
mm/memcontrol.c | 85 +++++++++++++++++++++++
mm/swap_state.c | 6 +-
mm/swap_tier.c | 89 ++++++++++++++++++++++++-
mm/swap_tier.h | 39 ++++++++++-
mm/swapfile.c | 4 ++
7 files changed, 246 insertions(+), 7 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7f5b59d95fce..776a908ce1b9 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1848,6 +1848,33 @@ The following nested keys are defined.
Swap usage hard limit. If a cgroup's swap usage reaches this
limit, anonymous memory of the cgroup will not be swapped out.
+ memory.swap.tiers
+ A read-write file which exists on non-root
+ cgroups. The default is to enable all tiers.
+
+ This interface allows selecting which swap tiers a cgroup can
+ use for swapping out memory.
+
+ The effective tiers are inherited from the parent. Only tiers
+ effective in the parent can be effective in the child. However,
+ the child can explicitly disable tiers allowed by the parent.
+
+ When read, the file shows two lines:
+ - The first line lists the tiers enabled by this cgroup's
+ own configuration.
+ - The second line lists the effective tiers after
+ merging with parent settings.
+
+ When writing, the format is:
+ (+/-)(TIER_NAME) (+/-)(TIER_NAME) ...
+
+ Valid tier names are those configured in
+ /sys/kernel/mm/swap/tiers.
+
+ Each tier can be prefixed with:
+ + Enable this tier
+ - Disable this tier
+
memory.swap.events
A read-only flat-keyed file which exists on non-root cgroups.
The following entries are defined. Unless specified
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6c82c8f73e1..542bee1b5f60 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -283,7 +283,8 @@ struct mem_cgroup {
/* per-memcg mm_struct list */
struct lru_gen_mm_list mm_list;
#endif
-
+ int tier_mask;
+ int tier_effective_mask;
#ifdef CONFIG_MEMCG_V1
/* Legacy consumer-oriented counters */
struct page_counter kmem; /* v1 only */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 007413a53b45..5fcf8ebe0ca8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -68,6 +68,7 @@
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"
+#include "swap_tier.h"
#include <linux/uaccess.h>
@@ -3691,6 +3692,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
{
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
+ swap_tiers_memcg_sync_mask(memcg);
__mem_cgroup_free(memcg);
}
@@ -3792,6 +3794,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
WRITE_ONCE(memcg->zswap_writeback, true);
#endif
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
+ memcg->tier_mask = TIER_ALL_MASK;
+ swap_tiers_memcg_inherit_mask(memcg, parent);
+
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
@@ -5352,6 +5357,80 @@ static int swap_events_show(struct seq_file *m, void *v)
return 0;
}
+/*
+ * Show handler for memory.swap.tiers.
+ *
+ * Emits two lines through swap_tiers_mask_show(): first the tiers selected
+ * by this memcg's own configured mask, then the tiers in its effective mask.
+ * Always returns 0.
+ */
+static int swap_tier_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *cg = mem_cgroup_from_seq(m);
+
+	/* Line 1: configured mask. */
+	swap_tiers_mask_show(m, cg->tier_mask);
+	/* Line 2: effective mask. */
+	swap_tiers_mask_show(m, cg->tier_effective_mask);
+	return 0;
+}
+
+/*
+ * Write handler for memory.swap.tiers.
+ *
+ * @of:     open file the write arrived on; identifies the target memcg
+ * @buf:    input text, stripped and tokenized in place
+ * @nbytes: number of bytes written; returned unchanged on success
+ * @off:    file offset (unused)
+ *
+ * The input is a list of tokens separated by spaces, tabs or newlines. Each
+ * token is "+NAME" or "-NAME", where NAME must match an active swap tier:
+ * "+" sets and "-" clears that tier's bit in the memcg's configured mask.
+ * Input that is empty after stripping resets the configured mask to
+ * TIER_ALL_MASK.
+ *
+ * The update is all-or-nothing: on any bad token the configured mask is
+ * rolled back to its value on entry and effective masks are left untouched.
+ * The whole operation runs under swap_tier_lock.
+ *
+ * Return: @nbytes on success, -EINVAL for a token without a +/- prefix or
+ * naming a tier that swap_tiers_mask_lookup() does not find.
+ */
+static ssize_t swap_tier_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	char *pos, *token;
+	int original_mask;
+	int ret = 0;
+
+	pos = strstrip(buf);
+
+	spin_lock(&swap_tier_lock);
+
+	/* Snapshot first so every path below has a defined rollback value. */
+	original_mask = memcg->tier_mask;
+
+	if (!*pos) {
+		memcg->tier_mask = TIER_ALL_MASK;
+		goto sync;
+	}
+
+	while ((token = strsep(&pos, " \t\n")) != NULL) {
+		int mask;
+
+		/* Consecutive separators yield empty tokens; skip them. */
+		if (!*token)
+			continue;
+
+		if (token[0] != '-' && token[0] != '+') {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		mask = swap_tiers_mask_lookup(token + 1);
+		if (!mask) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/*
+		 * Only the configured mask changes here. A child may enable a
+		 * tier its parent has disabled; the effective masks are
+		 * recomputed against the parent once parsing has succeeded.
+		 * The prefix was validated above, so it is either '-' or '+'.
+		 */
+		if (token[0] == '-')
+			memcg->tier_mask &= ~mask;
+		else
+			memcg->tier_mask |= mask;
+	}
+
+sync:
+	__swap_tiers_memcg_sync_mask(memcg);
+err:
+	if (ret)
+		memcg->tier_mask = original_mask;
+	spin_unlock(&swap_tier_lock);
+	return ret ? ret : nbytes;
+}
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -5384,6 +5463,12 @@ static struct cftype swap_files[] = {
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
.seq_show = swap_events_show,
},
+ {
+ .name = "swap.tiers",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_tier_show,
+ .write = swap_tier_write,
+ },
{ } /* terminate */
};
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d46ca61d2e42..c0dcab74779d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -961,6 +961,8 @@ static ssize_t tiers_store(struct kobject *kobj,
char *p, *token, *name, *tmp;
int ret = 0;
short prio;
+ int mask = 0;
+
DEFINE_SWAP_TIER_SAVE_CTX(ctx);
tmp = kstrdup(buf, GFP_KERNEL);
@@ -978,7 +980,7 @@ static ssize_t tiers_store(struct kobject *kobj,
continue;
if (token[0] == '-') {
- ret = swap_tiers_remove(token + 1);
+ ret = swap_tiers_remove(token + 1, &mask);
} else {
name = strsep(&token, ":");
@@ -997,7 +999,7 @@ static ssize_t tiers_store(struct kobject *kobj,
goto restore;
}
- if (!swap_tiers_update()) {
+ if (!swap_tiers_update(mask)) {
ret = -EINVAL;
goto restore;
}
diff --git a/mm/swap_tier.c b/mm/swap_tier.c
index 7741214312c7..0e067ba545cb 100644
--- a/mm/swap_tier.c
+++ b/mm/swap_tier.c
@@ -232,7 +232,7 @@ int swap_tiers_add(const char *name, int prio)
return ret;
}
-int swap_tiers_remove(const char *name)
+int swap_tiers_remove(const char *name, int *mask)
{
int ret = 0;
struct swap_tier *tier;
@@ -255,6 +255,8 @@ int swap_tiers_remove(const char *name)
list_prev_entry(tier, list)->prio = DEF_SWAP_PRIO;
list_move(&tier->list, &swap_tier_inactive_list);
+ *mask |= TIER_MASK(tier);
+
return ret;
}
@@ -351,7 +353,17 @@ void swap_tiers_assign_dev(struct swap_info_struct *swp)
swp->tier_mask = TIER_DEFAULT_MASK;
}
-bool swap_tiers_update(void)
+/*
+ * Set the bits in @mask in both the configured and the effective tier mask
+ * of every memcg visited by a walk starting at root_mem_cgroup.
+ */
+static void swap_tier_memcg_propagate(int mask)
+{
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, root_mem_cgroup) {
+		iter->tier_effective_mask |= mask;
+		iter->tier_mask |= mask;
+	}
+}
+
+bool swap_tiers_update(int mask)
{
struct swap_tier *tier;
struct swap_info_struct *swp;
@@ -379,6 +391,79 @@ bool swap_tiers_update(void)
break;
swap_tiers_assign_dev(swp);
}
+ /*
+ * XXX: Unused tiers default to ON, disabled after next tier added.
+ * Use removed tier mask to clear settings for removed/re-added tiers.
+ * (Could hold tier refs, but better to keep cgroup config independent)
+ */
+ if (mask)
+ swap_tier_memcg_propagate(mask);
return true;
}
+
+/*
+ * Print the names of the active tiers selected by @mask to @m.
+ *
+ * Each matching name is followed by a single space, and the line is always
+ * terminated with a newline (so an empty selection prints just "\n").
+ * The active tier list is walked with swap_tier_lock held.
+ */
+void swap_tiers_mask_show(struct seq_file *m, int mask)
+{
+	struct swap_tier *t;
+
+	spin_lock(&swap_tier_lock);
+	for_each_active_tier(t) {
+		/* Skip tiers whose bit is not set in the requested mask. */
+		if (!(mask & TIER_MASK(t)))
+			continue;
+		seq_printf(m, "%s ", t->name);
+	}
+	spin_unlock(&swap_tier_lock);
+	seq_puts(m, "\n");
+}
+
+/*
+ * Look up an active tier by name.
+ *
+ * The caller must hold swap_tier_lock. Returns TIER_MASK() of the first
+ * active tier whose name equals @name, or 0 when no active tier matches.
+ */
+int swap_tiers_mask_lookup(const char *name)
+{
+	struct swap_tier *t;
+
+	lockdep_assert_held(&swap_tier_lock);
+
+	for_each_active_tier(t) {
+		if (strcmp(name, t->name) != 0)
+			continue;
+		return TIER_MASK(t);
+	}
+
+	return 0;
+}
+
+/*
+ * Recompute @memcg's effective tier mask: its configured mask ANDed with
+ * the parent's effective mask, or with TIER_ALL_MASK when @parent is NULL.
+ */
+static void __swap_tier_memcg_inherit_mask(struct mem_cgroup *memcg,
+					   struct mem_cgroup *parent)
+{
+	int allowed = TIER_ALL_MASK;
+
+	if (parent)
+		allowed = parent->tier_effective_mask;
+
+	memcg->tier_effective_mask = memcg->tier_mask & allowed;
+}
+
+/*
+ * Locked wrapper around __swap_tier_memcg_inherit_mask(): takes
+ * swap_tier_lock, recomputes @memcg's effective tier mask from @parent
+ * (which may be NULL), and drops the lock again.
+ */
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg,
+				   struct mem_cgroup *parent)
+{
+	spin_lock(&swap_tier_lock);
+
+	__swap_tier_memcg_inherit_mask(memcg, parent);
+
+	spin_unlock(&swap_tier_lock);
+}
+
+/*
+ * Recompute the effective tier mask of every memcg visited by a walk of the
+ * tree rooted at @memcg, each one from its own parent.
+ *
+ * The caller must hold swap_tier_lock. Does nothing for root_mem_cgroup.
+ * NOTE(review): the result is only hierarchy-consistent if the walk visits
+ * a parent before its children - confirm for_each_mem_cgroup_tree()
+ * guarantees that order.
+ */
+void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *pos;
+
+	lockdep_assert_held(&swap_tier_lock);
+
+	if (memcg != root_mem_cgroup) {
+		for_each_mem_cgroup_tree(pos, memcg)
+			__swap_tier_memcg_inherit_mask(pos,
+						       parent_mem_cgroup(pos));
+	}
+}
+
+/*
+ * Reset @memcg's configured tier mask to TIER_ALL_MASK and then
+ * resynchronise effective masks via __swap_tiers_memcg_sync_mask(),
+ * all under swap_tier_lock.
+ *
+ * Note that, despite the name, this is not a pure "sync": it discards
+ * whatever configuration @memcg had.
+ */
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg)
+{
+	spin_lock(&swap_tier_lock);
+
+	memcg->tier_mask = TIER_ALL_MASK;
+	__swap_tiers_memcg_sync_mask(memcg);
+
+	spin_unlock(&swap_tier_lock);
+}
diff --git a/mm/swap_tier.h b/mm/swap_tier.h
index de81d540e3b5..9024c82c807a 100644
--- a/mm/swap_tier.h
+++ b/mm/swap_tier.h
@@ -31,19 +31,54 @@ struct swap_tier_save_ctx {
#define TIER_DEFAULT_IDX (31)
#define TIER_DEFAULT_MASK (1 << TIER_DEFAULT_IDX)
+/*
+ * folio_tier_effective_mask - tier mask to filter swap devices for @folio
+ *
+ * With CONFIG_MEMCG, returns the effective tier mask of the memcg reported
+ * by folio_memcg(), or TIER_ALL_MASK when that is NULL. Without
+ * CONFIG_MEMCG, always returns TIER_ALL_MASK.
+ */
+#ifdef CONFIG_MEMCG
+static inline int folio_tier_effective_mask(struct folio *folio)
+{
+	struct mem_cgroup *memcg = folio_memcg(folio);
+
+	if (!memcg)
+		return TIER_ALL_MASK;
+
+	return memcg->tier_effective_mask;
+}
+#else
+static inline int folio_tier_effective_mask(struct folio *folio)
+{
+	return TIER_ALL_MASK;
+}
+#endif
+
/* Initialization and application */
void swap_tiers_init(void);
ssize_t swap_tiers_sysfs_show(char *buf);
int swap_tiers_add(const char *name, int prio);
-int swap_tiers_remove(const char *name);
+int swap_tiers_remove(const char *name, int *mask);
int swap_tiers_modify(const char *name, int prio);
void swap_tiers_save(struct swap_tier_save_ctx ctx[]);
void swap_tiers_restore(struct swap_tier_save_ctx ctx[]);
-bool swap_tiers_update(void);
+bool swap_tiers_update(int mask);
/* Tier assignment */
void swap_tiers_assign_dev(struct swap_info_struct *swp);
+/* Memcg related functions */
+void swap_tiers_mask_show(struct seq_file *m, int mask);
+void swap_tiers_memcg_inherit_mask(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent);
+void swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+void __swap_tiers_memcg_sync_mask(struct mem_cgroup *memcg);
+
+/* Mask and tier lookup */
+int swap_tiers_mask_lookup(const char *name);
+
+/**
+ * swap_tiers_mask_test - Check whether two tier masks share any tier
+ * @tier_mask: The tier mask to check
+ * @mask: The mask to compare against
+ *
+ * Return: true if at least one bit is set in both masks, false otherwise
+ */
+static inline bool swap_tiers_mask_test(int tier_mask, int mask)
+{
+	return (tier_mask & mask) != 0;
+}
#endif /* _SWAP_TIER_H */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4f8ce021c5bd..e04811e10431 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1348,10 +1348,14 @@ static bool swap_alloc_fast(struct folio *folio)
static void swap_alloc_slow(struct folio *folio)
{
struct swap_info_struct *si, *next;
+ int mask = folio_tier_effective_mask(folio);
spin_lock(&swap_avail_lock);
start_over:
plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
+ if (!swap_tiers_mask_test(si->tier_mask, mask))
+ continue;
+
/* Rotate the device and switch to a new cluster */
plist_requeue(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
--
2.34.1
Hi. This is merely the API feedback. (Feedback to the propsed form, I'm not sure whether/how this should interact with memory.swap.max (formally cf io.weight).) On Sat, Jan 31, 2026 at 09:54:52PM +0900, Youngjun Park <youngjun.park@lge.com> wrote: > This patch integrates the swap tier infrastructure with cgroup, > enabling the selection of specific swap devices per cgroup by > configuring allowed swap tiers. > > The new `memory.swap.tiers` interface controls allowed swap tiers via a mask. > By default, the mask is set to include all tiers, allowing specific tiers to > be excluded or restored. Note that effective tiers are calculated separately > using a dedicated mask to respect the cgroup hierarchy. Consequently, > configured tiers may differ from effective ones, as they must be a subset > of the parent's. > > Note that cgroups do not pin swap tiers. This is similar to the > `cpuset` controller, which does not prevent CPU hotplug. This > approach ensures flexibility by allowing tier configuration changes > regardless of cgroup usage. > > Signed-off-by: Youngjun Park <youngjun.park@lge.com> > --- > Documentation/admin-guide/cgroup-v2.rst | 27 ++++++++ > include/linux/memcontrol.h | 3 +- > mm/memcontrol.c | 85 +++++++++++++++++++++++ > mm/swap_state.c | 6 +- > mm/swap_tier.c | 89 ++++++++++++++++++++++++- > mm/swap_tier.h | 39 ++++++++++- > mm/swapfile.c | 4 ++ > 7 files changed, 246 insertions(+), 7 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > index 7f5b59d95fce..776a908ce1b9 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -1848,6 +1848,33 @@ The following nested keys are defined. > Swap usage hard limit. If a cgroup's swap usage reaches this > limit, anonymous memory of the cgroup will not be swapped out. 
> > + memory.swap.tiers > + A read-write nested-keyed file which exists on non-root "nested-keyed" format is something else in this document's lingo, see e.g. io.stat. I think you wanted to make this resemble cgroup.subtree_control (which is fine). > + cgroups. The default is to enable all tiers. > + > + This interface allows selecting which swap tiers a cgroup can > + use for swapping out memory. > + > + The effective tiers are inherited from the parent. Only tiers > + effective in the parent can be effective in the child. However, > + the child can explicitly disable tiers allowed by the parent. > + > + When read, the file shows two lines: > + - The first line shows the operation string that was > + written to this file. > + - The second line shows the effective operation after > + merging with parent settings. The convention (in cpuset) is to split it in two files like memory.swap.tiers and memory.swap.tiers.effective. > + > + When writing, the format is: > + (+/-)(TIER_NAME) (+/-)(TIER_NAME) ... > + > + Valid tier names are those configured in > + /sys/kernel/mm/swap/tiers. > + > + Each tier can be prefixed with: > + + Enable this tier > + - Disable this tier > + I believe these are only superficial adjustments not affecting the implementation. Thanks, Michal
On Tue, Feb 03, 2026 at 11:54:41AM +0100, Michal Koutný wrote: > Hi. > > This is merely the API feedback. > > (Feedback to the propsed form, I'm not sure whether/how this should > interact with memory.swap.max (formally cf io.weight).) > > On Sat, Jan 31, 2026 at 09:54:52PM +0900, Youngjun Park <youngjun.park@lge.com> wrote: > > This patch integrates the swap tier infrastructure with cgroup, > > enabling the selection of specific swap devices per cgroup by > > configuring allowed swap tiers. > > > > The new `memory.swap.tiers` interface controls allowed swap tiers via a mask. > > By default, the mask is set to include all tiers, allowing specific tiers to > > be excluded or restored. Note that effective tiers are calculated separately > > using a dedicated mask to respect the cgroup hierarchy. Consequently, > > configured tiers may differ from effective ones, as they must be a subset > > of the parent's. > > > > Note that cgroups do not pin swap tiers. This is similar to the > > `cpuset` controller, which does not prevent CPU hotplug. This > > approach ensures flexibility by allowing tier configuration changes > > regardless of cgroup usage. > > > > Signed-off-by: Youngjun Park <youngjun.park@lge.com> > > --- > > Documentation/admin-guide/cgroup-v2.rst | 27 ++++++++ > > include/linux/memcontrol.h | 3 +- > > mm/memcontrol.c | 85 +++++++++++++++++++++++ > > mm/swap_state.c | 6 +- > > mm/swap_tier.c | 89 ++++++++++++++++++++++++- > > mm/swap_tier.h | 39 ++++++++++- > > mm/swapfile.c | 4 ++ > > 7 files changed, 246 insertions(+), 7 deletions(-) > > > > diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst > > index 7f5b59d95fce..776a908ce1b9 100644 > > --- a/Documentation/admin-guide/cgroup-v2.rst > > +++ b/Documentation/admin-guide/cgroup-v2.rst > > @@ -1848,6 +1848,33 @@ The following nested keys are defined. > > Swap usage hard limit. 
If a cgroup's swap usage reaches this > > limit, anonymous memory of the cgroup will not be swapped out. > > > > + memory.swap.tiers > > + A read-write nested-keyed file which exists on non-root > > "nested-keyed" format is something else in this document's lingo, see > e.g. io.stat. > > I think you wanted to make this resemble cgroup.subtree_control (which > is fine). You are right, I used the wrong expression. Simply describing it as a "file" seems sufficient. > > > + cgroups. The default is to enable all tiers. > > + > > + This interface allows selecting which swap tiers a cgroup can > > + use for swapping out memory. > > + > > + The effective tiers are inherited from the parent. Only tiers > > + effective in the parent can be effective in the child. However, > > + the child can explicitly disable tiers allowed by the parent. > > + > > + When read, the file shows two lines: > > + - The first line shows the operation string that was > > + written to this file. > > + - The second line shows the effective operation after > > + merging with parent settings. > > The convention (in cpuset) is to split it in two files like > memory.swap.tiers and memory.swap.tiers.effective. I will separate the two according to the convention. Thanks for correction. > > + > > + When writing, the format is: > > + (+/-)(TIER_NAME) (+/-)(TIER_NAME) ... > > + > > + Valid tier names are those configured in > > + /sys/kernel/mm/swap/tiers. > > + > > + Each tier can be prefixed with: > > + + Enable this tier > > + - Disable this tier > > + > > I believe these are only superficial adjustments not affecting the > implementation. > > Thanks, > Michal Thanks for the review, Michal. Youngjun Park
© 2016 - 2026 Red Hat, Inc.