From: "youngjun.park" <youngjun.park@lge.com>
We are working in constrained environments where devices often
operate with limited resources. To improve overall system
responsiveness, especially under memory pressure, we aim to use idle
devices as swap targets over the network.
In this context, we propose a mechanism to control swap priorities on a
per-cgroup basis.
By assigning different swap priorities to each cgroup, we can ensure
that critical applications maintain higher responsiveness and
stability, while less important workloads experience deferred swap
activity.
The following is a detailed explanation of the implementation.
1. Object Description
- swap_cgroup_priority
This object manages an array of swap_cgroup_priority_pnode entries,
each of which points to a swap device and its associated priority.
- swap_cgroup_priority_pnode
This object points to a swap device and holds the priority
assigned to that device through the interface.
2. Object Lifecycle
- The swap_cgroup_priority and swap_cgroup_priority_pnode share the same
lifetime.
- Objects are managed through the memory.swap.priority interface.
Each swap device is assigned a unique ID at swapon time,
which can be queried via the memory.swap.priority interface.
Example:
cat memory.swap.priority
Inactive
/dev/sdb unique:1 prio:10
/dev/sdc unique:2 prio:5
- Creation
echo "unique id of swapdev 1: priority, unique id of swapdev 2: priority ..."
> memory.swap.priority
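For instance, assuming the devices shown above (unique IDs 1 and 2),
a write assigning them priorities 10 and 5 could look like this (the
write handler splits on ',' and then ':', and kstrtoint rejects stray
whitespace around the numbers):
Example: echo "1:10,2:5" > memory.swap.priority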
- Destruction
Reset through the memory.swap.priority interface.
Example: echo "" > memory.swap.priority
Objects are also destroyed when the mem_cgroup is removed.
3. Priority Mechanism
- Follows the original concept of swap priority.
(This includes automatic binding of swap devices to NUMA nodes.)
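As an assumed worked example mirroring the pnode initialization in
this patch: on a two-node system, a device with global prio -2 that is
local to node 0 is inserted with plist value 1 on node 0 and value 2
(-prio) on node 1; lower plist values are preferred, so node 0 CPUs
pick it first, as with the global NUMA auto-binding.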
- Swap On/Off Propagation
When swapon is executed, the settings are propagated to the new
device; likewise, when swapoff is executed, the corresponding
settings are removed.
The implementation of swap on/off propagation and the mechanism
for iterating through the configured swap cgroup priorities
are available in the next patch.
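As an illustration only (the actual selection logic lives in the next
patch and is not shown here), per-cgroup device selection could mirror
the global swap_avail_heads walk, but over the cgroup's own plist. The
helper below is a hypothetical sketch against the structures this
patch introduces:

  static struct swap_info_struct *
  swap_cgroup_priority_pick(struct mem_cgroup *memcg, int nid)
  {
  	struct swap_cgroup_priority_pnode *pnode;

  	/*
  	 * Hypothetical helper; assumes memcg->swap_priority is
  	 * non-NULL and swap_avail_lock is held, as in the global path.
  	 */
  	plist_for_each_entry(pnode, &memcg->swap_priority->plist[nid],
  			     avail_lists[nid]) {
  		if (pnode->swap->flags & SWP_WRITEOK)
  			return pnode->swap;
  	}

  	return NULL;	/* fall back to the global avail lists */
  }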
Signed-off-by: Youngjun Park <youngjun.park@lge.com>
Suggested-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
include/linux/memcontrol.h | 3 +
include/linux/swap.h | 3 +
mm/Kconfig | 7 ++
mm/memcontrol.c | 55 ++++++++++
mm/swap.h | 10 ++
mm/swap_cgroup_priority.c | 202 +++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 6 ++
7 files changed, 286 insertions(+)
create mode 100644 mm/swap_cgroup_priority.c
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 87b6688f124a..625e59f9ecd2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -218,6 +218,9 @@ struct mem_cgroup {
bool zswap_writeback;
#endif
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ struct swap_cgroup_priority *swap_priority;
+#endif
/* vmpressure notifications */
struct vmpressure vmpressure;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc0e1c275fc0..49b73911c1bd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -339,6 +339,9 @@ struct swap_info_struct {
struct work_struct discard_work; /* discard worker */
struct work_struct reclaim_work; /* reclaim worker */
struct list_head discard_clusters; /* discard clusters list */
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ int unique_id;
+#endif
struct plist_node avail_lists[]; /*
* entries in swap_avail_heads, one
* entry per node.
diff --git a/mm/Kconfig b/mm/Kconfig
index 781be3240e21..ff4b0ef867f4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -190,6 +190,13 @@ config ZSMALLOC_CHAIN_SIZE
For more information, see zsmalloc documentation.
+config SWAP_CGROUP_PRIORITY
+ bool "Use swap cgroup priority"
+ default n
+ depends on SWAP && CGROUPS
+ help
+ This option enables per-cgroup swap device priorities.
+
menu "Slab allocator options"
config SLUB
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 902da8a9c643..628ffb048489 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -69,6 +69,7 @@
#include <net/ip.h>
#include "slab.h"
#include "memcontrol-v1.h"
+#include "swap.h"
#include <linux/uaccess.h>
@@ -3702,6 +3703,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
{
lru_gen_exit_memcg(memcg);
memcg_wb_domain_exit(memcg);
+ delete_swap_cgroup_priority(memcg);
__mem_cgroup_free(memcg);
}
@@ -5403,6 +5405,51 @@ static int swap_events_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+static ssize_t swap_cgroup_priority_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ int ret;
+ int unique[MAX_SWAPFILES] = {0, };
+ int prios[MAX_SWAPFILES] = {0,};
+ int idx = 0;
+ char *token;
+
+ buf = strstrip(buf);
+ if (strlen(buf) == 0) {
+ delete_swap_cgroup_priority(memcg);
+ return nbytes;
+ }
+
+ while ((token = strsep(&buf, ",")) != NULL) {
+ char *token2 = token;
+ char *token3;
+
+ token3 = strsep(&token2, ":");
+ if (!token2 || !token3)
+ return -EINVAL;
+
+ if (kstrtoint(token3, 10, &unique[idx]) ||
+ kstrtoint(token2, 10, &prios[idx]))
+ return -EINVAL;
+
+ idx++;
+ }
+
+ if ((ret = create_swap_cgroup_priority(memcg, unique, prios, idx)))
+ return ret;
+
+ return nbytes;
+}
+
+static int swap_cgroup_priority_show(struct seq_file *m, void *v)
+{
+ show_swap_device_unique_id(m);
+ return 0;
+}
+#endif
+
static struct cftype swap_files[] = {
{
.name = "swap.current",
@@ -5435,6 +5482,14 @@ static struct cftype swap_files[] = {
.file_offset = offsetof(struct mem_cgroup, swap_events_file),
.seq_show = swap_events_show,
},
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ {
+ .name = "swap.priority",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_cgroup_priority_show,
+ .write = swap_cgroup_priority_write,
+ },
+#endif
{ } /* terminate */
};
diff --git a/mm/swap.h b/mm/swap.h
index 2269eb9df0af..cd2649c632ed 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -106,6 +106,16 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
return find_next_bit(sis->zeromap, end, start) - start;
}
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+int create_swap_cgroup_priority(struct mem_cgroup *memcg,
+ int unique[], int prio[], int nr);
+void delete_swap_cgroup_priority(struct mem_cgroup *memcg);
+void show_swap_device_unique_id(struct seq_file *m);
+#else
+static inline void delete_swap_cgroup_priority(struct mem_cgroup *memcg) {}
+static inline void get_swap_unique_id(struct swap_info_struct *si) {}
+#endif
+
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
diff --git a/mm/swap_cgroup_priority.c b/mm/swap_cgroup_priority.c
new file mode 100644
index 000000000000..b3e20b676680
--- /dev/null
+++ b/mm/swap_cgroup_priority.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* per mem_cgroup */
+struct swap_cgroup_priority {
+ struct list_head link;
+ /* XXX: flattening into one allocation is hard; the variable-length array is our enemy */
+ struct swap_cgroup_priority_pnode *pnode[MAX_SWAPFILES];
+ struct plist_head plist[];
+};
+
+/* per mem_cgroup & per swap device node */
+struct swap_cgroup_priority_pnode {
+ struct swap_info_struct *swap;
+ int prio;
+ struct plist_node avail_lists[];
+};
+
+/* per swap device unique id counter */
+static atomic_t swap_unique_id_counter;
+
+/* active swap_cgroup_priority list */
+static LIST_HEAD(swap_cgroup_priority_list);
+
+/* XXX: Don't want memcontrol to know swap_cgroup_priority internals. */
+void show_swap_device_unique_id(struct seq_file *m)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ spin_lock(&swap_lock);
+ /* XXX: what would be the most readable output format? */
+ seq_printf(m, "%s\n", memcg->swap_priority ? "Active" : "Inactive");
+ for (int i = 0; i < nr_swapfiles; i++) {
+ struct swap_info_struct *si = swap_info[i];
+
+ if (!(si->flags & SWP_USED))
+ continue;
+
+ seq_file_path(m, si->swap_file, "\t\n\\");
+ seq_printf(m, "\tunique:%d\t", si->unique_id);
+
+ if (!memcg->swap_priority) {
+ seq_printf(m, " prio:%d\n", si->prio);
+ continue;
+ }
+
+ seq_printf(m, "prio:%d\n",
+ memcg->swap_priority->pnode[i]->prio);
+ }
+ spin_unlock(&swap_lock);
+}
+
+static void get_swap_unique_id(struct swap_info_struct *si)
+{
+ si->unique_id = atomic_add_return(1, &swap_unique_id_counter);
+}
+
+int create_swap_cgroup_priority(struct mem_cgroup *memcg,
+ int unique[], int prio[], int nr)
+{
+ bool b_found = false;
+ struct swap_cgroup_priority *swap_priority, *old_swap_priority = NULL;
+ int nid;
+
+ /* Fast check */
+ if (nr != nr_swapfiles)
+ return -EINVAL;
+
+ /*
+ * XXX: always make newly object and exchange it.
+ * possible to give object reusability if it is simple and better.
+ */
+ swap_priority = kvmalloc(struct_size(swap_priority, plist, nr_node_ids),
+ GFP_KERNEL);
+
+ if (!swap_priority)
+ return -ENOMEM;
+
+ /* XXX: uses preallocation. would allocating at swapon time be better? */
+ for (int i = 0; i < MAX_SWAPFILES; i++) {
+ swap_priority->pnode[i] =
+ kvmalloc(struct_size(swap_priority->pnode[0],
+ avail_lists, nr_node_ids),
+ GFP_KERNEL);
+
+ if (!swap_priority->pnode[i]) {
+ for (int j = 0; j < i; j++)
+ kvfree(swap_priority->pnode[j]);
+
+ kvfree(swap_priority);
+ return -ENOMEM;
+ }
+ }
+
+ INIT_LIST_HEAD(&swap_priority->link);
+ for_each_node(nid)
+ plist_head_init(&swap_priority->plist[nid]);
+
+ spin_lock(&swap_lock);
+ spin_lock(&swap_avail_lock);
+
+ /* swap on/off under us. */
+ if (nr != nr_swapfiles)
+ goto error;
+
+ /* TODO: naive search; make it fast. */
+ for (int i = 0; i < nr; i++) {
+ b_found = false;
+ for (int j = 0; j < nr_swapfiles; j++) {
+ struct swap_info_struct *si = swap_info[j];
+ struct swap_cgroup_priority_pnode *pnode
+ = swap_priority->pnode[j];
+
+ if (si->unique_id != unique[i])
+ continue;
+
+ /* swap off under us */
+ if (!(si->flags & SWP_USED))
+ goto error;
+
+ int k;
+ for_each_node(k) {
+ if (prio[i] >= 0) {
+ pnode->prio = prio[i];
+ plist_node_init(&pnode->avail_lists[k],
+ -pnode->prio);
+ } else {
+ pnode->prio = si->prio;
+ if (swap_node(si) == k)
+ plist_node_init(
+ &pnode->avail_lists[k],
+ 1);
+ else
+ plist_node_init(
+ &pnode->avail_lists[k],
+ -pnode->prio);
+ }
+
+ plist_add(&pnode->avail_lists[k],
+ &swap_priority->plist[k]);
+ }
+
+ pnode->swap = si;
+ b_found = true;
+ break;
+ }
+
+ /* cannot find unique id pair */
+ if (!b_found)
+ goto error;
+ }
+
+ if (memcg->swap_priority) {
+ old_swap_priority = memcg->swap_priority;
+ list_del(&old_swap_priority->link);
+ }
+
+ list_add(&swap_priority->link, &swap_cgroup_priority_list);
+
+ memcg->swap_priority = swap_priority;
+ spin_unlock(&swap_avail_lock);
+ spin_unlock(&swap_lock);
+
+ if (old_swap_priority) {
+ for (int i = 0; i < MAX_SWAPFILES; i++)
+ kvfree(old_swap_priority->pnode[i]);
+ kvfree(old_swap_priority);
+ }
+
+ return 0;
+
+error:
+ spin_unlock(&swap_avail_lock);
+ spin_unlock(&swap_lock);
+
+ for (int i = 0; i < MAX_SWAPFILES; i++)
+ kvfree(swap_priority->pnode[i]);
+ kvfree(swap_priority);
+
+ return -EINVAL;
+}
+
+void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
+{
+ struct swap_cgroup_priority *swap_priority;
+
+ spin_lock(&swap_avail_lock);
+ swap_priority = memcg->swap_priority;
+ if (!swap_priority) {
+ spin_unlock(&swap_avail_lock);
+ return;
+ }
+ memcg->swap_priority = NULL;
+ list_del(&swap_priority->link);
+ spin_unlock(&swap_avail_lock);
+
+ /* wait show_swap_device_unique_id */
+ synchronize_rcu();
+
+ for (int i = 0; i < MAX_SWAPFILES; i++)
+ kvfree(swap_priority->pnode[i]);
+ kvfree(swap_priority);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 68ce283e84be..f8e48dd2381e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -126,6 +126,10 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
.offset = { SWAP_ENTRY_INVALID },
.lock = INIT_LOCAL_LOCK(),
};
+/* TODO: better choice? */
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+#include "swap_cgroup_priority.c"
+#endif
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
@@ -3462,6 +3466,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto free_swap_zswap;
}
+ get_swap_unique_id(si);
+
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
--
2.34.1
Hello.

On Thu, Jun 12, 2025 at 07:37:43PM +0900, youngjun.park@lge.com wrote:
> Example:
> cat memory.swap.priority
> Inactive
> /dev/sdb unique:1 prio:10
> /dev/sdc unique:2 prio:5
>
> - Creation
> echo "<unique id of swapdev 1>:<priority>, <unique id of swapdev 2>:<priority>, ..."
> > memory.swap.priority
>
> - Destruction
> Reset through the memory.swap.priority interface.
> Example: echo "" > memory.swap.priority
> Objects are also destroyed when the mem_cgroup is removed.
>
> 3. Priority Mechanism
> - Follows the original concept of swap priority.
> (This includes automatic binding of swap devices to NUMA nodes.)

How is this supposed to work

  cg1  /dev/sda prio:10
       /dev/sdb prio:5
   ` cg3  /dev/sda prio:5
          /dev/sdb prio:10
  cg2  /dev/sda prio:5
       /dev/sdb prio:10
   ` cg4  /dev/sda prio:10
          /dev/sdb prio:5

when there are competitors from cg3 and cg4? Which device should be
preferred by each cgroup?

Interface note -- try to make it "Nested keyed" or "Flat keyed" as
described in Documentation/admin-guide/cgroup-v2.rst (like io.max or
io.weight), so that it is consistent with other cgroup v2 APIs.

HTH,
Michal
On Tue, Jun 17, 2025 at 02:23:07PM +0200, Michal Koutný wrote:
> Hello.
>
> On Thu, Jun 12, 2025 at 07:37:43PM +0900, youngjun.park@lge.com wrote:
> > Example:
> > cat memory.swap.priority
> > Inactive
> > /dev/sdb unique:1 prio:10
> > /dev/sdc unique:2 prio:5
> >
> > - Creation
> > echo "<unique id of swapdev 1>:<priority>, <unique id of swapdev 2>:<priority>, ..."
> > > memory.swap.priority
> >
> > - Destruction
> > Reset through the memory.swap.priority interface.
> > Example: echo "" > memory.swap.priority
> > Objects are also destroyed when the mem_cgroup is removed.
> >
> > 3. Priority Mechanism
> > - Follows the original concept of swap priority.
> > (This includes automatic binding of swap devices to NUMA nodes.)
>
> How is this supposed to work
>
>   cg1  /dev/sda prio:10
>        /dev/sdb prio:5
>    ` cg3  /dev/sda prio:5
>           /dev/sdb prio:10
>   cg2  /dev/sda prio:5
>        /dev/sdb prio:10
>    ` cg4  /dev/sda prio:10
>           /dev/sdb prio:5
>
> when there are competitors from cg3 and cg4? Which device should be
> preferred by each cgroup?

Hello Michal.

What issue is the question assuming the existence of competitors in
two cgroups trying to address? Could you explain it a bit more
specifically?

To answer your question for now: each cgroup simply prefers devices
according to its own priority values, until a swap device is
exhausted.

  cg1 prefers /dev/sda over /dev/sdb.
  cg2 prefers /dev/sdb over /dev/sda.
  cg3 prefers /dev/sdb over /dev/sda.
  cg4 prefers /dev/sda over /dev/sdb.

> Interface note -- try to make it "Nested keyed" or "Flat keyed" as
> described in Documentation/admin-guide/cgroup-v2.rst (like io.max or
> io.weight), so that it is consistent with other cgroup v2 APIs.

Yes, it looks like the API format should be adjusted as you suggested.
Thanks for the review.

Regards,
Youngjun Park
On Wed, Jun 18, 2025 at 09:32:13AM +0900, YoungJun Park <youngjun.park@lge.com> wrote:
> What issue is the question assuming the existence of competitors in
> two cgroups trying to address? Could you explain it a bit more
> specifically?

I'm after how this mechanism is supposed to honor hierarchical
structure. (I thought the numeric example was the most specific.)

> To answer your question for now: each cgroup simply prefers devices
> according to its own priority values, until a swap device is
> exhausted.
>
>   cg1 prefers /dev/sda over /dev/sdb.
>   cg2 prefers /dev/sdb over /dev/sda.
>   cg3 prefers /dev/sdb over /dev/sda.
>   cg4 prefers /dev/sda over /dev/sdb.

Hm, that means the settings from cg1 (or cg2) don't apply to
descendants cg3 (or cg4) :-/

When referring to that document
(Documentation/admin-guide/cgroup-v2.rst) again, which of the
"Resource Distribution Models" do you find the most fitting for this
scenario?

Thanks,
Michal
On Wed, Jun 18, 2025 at 11:11:32AM +0200, Michal Koutný wrote:
> On Wed, Jun 18, 2025 at 09:32:13AM +0900, YoungJun Park <youngjun.park@lge.com> wrote:
> > What issue is the question assuming the existence of competitors in
> > two cgroups trying to address? Could you explain it a bit more
> > specifically?
>
> I'm after how this mechanism is supposed to honor hierarchical
> structure. (I thought the numeric example was the most specific.)
>
> > To answer your question for now: each cgroup simply prefers devices
> > according to its own priority values, until a swap device is
> > exhausted.
> >
> >   cg1 prefers /dev/sda over /dev/sdb.
> >   cg2 prefers /dev/sdb over /dev/sda.
> >   cg3 prefers /dev/sdb over /dev/sda.
> >   cg4 prefers /dev/sda over /dev/sdb.
>
> Hm, that means the settings from cg1 (or cg2) don't apply to
> descendants cg3 (or cg4) :-/

I've been thinking about whether the use case I suggested aligns with
the philosophy of cgroups, and I believe there are two feasible
directions we could take. (These still need some detailed refinement.)
In both strategies, the child inherits the parent's settings.

1. Preserve the order of priorities and the set of swap devices when a
   child cgroup inherits values from its parent; the inherited order
   must be strictly maintained.

   e.g.
   1.1 possible cases
   1.1.1
       cgroupA (swapA-swapB-swapC)
        ` cgroupB (swapA-swapC)
   1.1.2
       cgroupA (swapA-swapB-swapC)
        ` cgroupB (swapA-swapC)
       after some time, modify it (swapD added on cgroupA)
       cgroupA (swapA-swapB-swapC-swapD)
        ` cgroupB (swapA-swapC)

   1.2 impossible cases
   1.2.1 violates the priority-order rule
       cgroupA (swapA-swapB-swapC)
        ` cgroupB (swapC-swapA-swapB)
   1.2.2 violates the swap-device-set rule
       cgroupA (swapA-swapB-swapC)
        ` cgroupB (swapD)

2. Restrict child cgroups to only use the values inherited from the
   parent, without allowing them to define their own settings.

   e.g.
       cgroupA (swapA-swapB-swapC)
        ` cgroupB (swapA-swapB-swapC)
       after some time, modify it (swapD added on cgroupA)
       cgroupA (swapA-swapB-swapC-swapD)
        ` cgroupB (swapA-swapB-swapC-swapD)
   This differs from case 1.1.2: here swapD is propagated (because
   child and parent must be identical).

> When referring to that document
> (Documentation/admin-guide/cgroup-v2.rst) again, which of the
> "Resource Distribution Models" do you find the most fitting for this
> scenario?

I initially submitted the RFC from the perspective that each in-use
swap device must explicitly have a priority assigned, including
propagation at swapon time (to avoid swap failure when using this
mechanism).

However, considering the resource distribution models you mentioned, I
now see that not requiring all swap devices to have an explicitly
defined priority aligns better with the broader cgroup "limit
distribution" philosophy, particularly in terms of limiting and
distributing resources. This is because cgroups can still restrict
swap device usage and control device order without requiring explicit
priorities for all devices.

In this view, the cgroup interface serves more as a limit or
preference mechanism across the full set of available swap devices,
rather than requiring full enumeration and configuration.

Regards,
Youngjun Park
On Wed, Jun 18, 2025 at 09:07:51PM +0900, YoungJun Park <youngjun.park@lge.com> wrote:
> This is because cgroups can still restrict swap device usage and
> control device order without requiring explicit priorities for all
> devices. In this view, the cgroup interface serves more as a limit or
> preference mechanism across the full set of available swap devices,
> rather than requiring full enumeration and configuration.

I was wondering whether your use cases would be catered for by having
a memory.swap.max limit per device (essentially disabling swap to the
undesired device(s) for a given group). The disadvantage is that
memory.swap.max already exists as a scalar.

Alternatively, priorities could be remapped to memory.swap.weight --
with sibling vs sibling competition, and children treated with the
weight of their parent when approached from the top. I find this
weight semantics a little weird, as it would clash with the other
.weight files, which are dual to this (cgroups competing over one
device vs a cgroup choosing between multiple devices).

Please try to take the existing distribution models into account so as
not to make something overly unidiomatic,
Michal
On Mon, Jun 30, 2025 at 07:39:47PM +0200, Michal Koutný wrote:
> On Wed, Jun 18, 2025 at 09:07:51PM +0900, YoungJun Park <youngjun.park@lge.com> wrote:
> > This is because cgroups can still restrict swap device usage and
> > control device order without requiring explicit priorities for all
> > devices. In this view, the cgroup interface serves more as a limit
> > or preference mechanism across the full set of available swap
> > devices, rather than requiring full enumeration and configuration.

Hello Michal,

Thank you very much for your thoughtful review and for sharing your
insights. I'd like to share my thoughts and the reasoning behind my
current direction, including some points I considered in relation to
your suggestions.

> I was wondering whether your use cases would be catered for by having
> a memory.swap.max limit per device (essentially disabling swap to the
> undesired device(s) for a given group). The disadvantage is that
> memory.swap.max already exists as a scalar.

I did consider implementing this kind of control. In that design, it
would work similarly to memory.swap.max but per device: the
implementation would iterate through the swap devices in priority
order and maintain per-cgroup counters for each device's usage. It
would also need to handle proper counter cleanup after use, and ensure
that usage checks also happen on the fastpath, where per-CPU caches
for swap device clusters come into play.

From a runtime-behavior perspective, the priority-based approach
seemed preferable, as it allows more flexible control: the configured
cgroup can strongly prefer the desired device and benefit from faster
selection at allocation time. I also considered how this would coexist
with the existing swap.max interface, but given the additional
implementation and runtime overhead this would introduce, I decided to
hold it back and chose a priority-based approach instead.

> Alternatively, priorities could be remapped to memory.swap.weight --
> with sibling vs sibling competition, and children treated with the
> weight of their parent when approached from the top. I find this
> weight semantics a little weird, as it would clash with the other
> .weight files, which are dual to this (cgroups competing over one
> device vs a cgroup choosing between multiple devices).

Your point about the semantic mismatch is very valid. I agree that
reusing .weight semantics here could be confusing: .weight usually
expresses competition among siblings for a shared resource, whereas
here the goal is to steer selection among multiple devices within a
single cgroup's scope. The swap priority concept already exists as an
independent mechanism, so mapping it into a .weight field might not
align well in practice.

> Please try to take the existing distribution models into account so
> as not to make something overly unidiomatic,

I also thought about possible alignment with existing mechanisms like
zswap.writeback. One alternative could be to adopt an on/off style
mechanism similar to zswap.writeback, including its propagation
strategy. Implementation-wise, this could be handled by including or
excluding devices from the cgroup's swap device priority list (the
direction I suggested). However, this approach also has limitations in
certain use cases. For example, if we want to enforce a different
ordering than the global system swap priority, an on/off switch alone
is not sufficient.

One possible example: some cgroup uses the slowest available swap
device, but with a larger capacity, to avoid swap failure.

  Global swap: A (fast) -> B (slower) -> C (slowest)
  Cgroup swap: C (slowest) -> B (slower) -> A (fast)

This kind of configuration cannot be achieved with an on/off switch
alone.

I recognize that the priority approach might not map perfectly to the
existing major distribution models (like limit, weight, etc.); I
cautiously see this as an extension of the resource control
interfaces, building on the solid foundation that the cgroup mechanism
already provides. I am working to ensure that the proposed interface
and propagation behavior integrate properly with parent cgroups and
follow the same interface style.

Here is the current version I am working on now. (It turned out a bit
long, but I felt it might be useful to share it with you.)

  memory.swap.priority
	A read-write flat-keyed file which exists on non-root cgroups.

	Example: (after swapon)
	$ swapon
	NAME      TYPE      SIZE USED PRIO
	/dev/sdb  partition 300M   0B   10
	/dev/sdc  partition 300M   0B    5
	/dev/sdd  partition 300M   0B   -2

	To assign priorities to swap devices in the current cgroup,
	write one or more lines in the following format:

	  <swap_device_unique_id> <priority>

	Example: (writing priorities)
	$ echo "1 4" > memory.swap.priority
	$ echo "2 -2" > memory.swap.priority
	$ echo "3 -1" > memory.swap.priority

	Example: (reading after write)
	$ cat memory.swap.priority
	1 4
	2 -2
	3 -1

	The priority semantics are consistent with the global swap
	system:
	- Higher values indicate higher preference.
	- See Documentation/admin-guide/mm/swap_numa.rst for swap NUMA
	  auto-binding.

	Note:
	A special value of -1 means the swap device is completely
	excluded from use by this cgroup. Unlike the global swap
	priority, where negative values simply lower the priority,
	setting -1 here disables allocation from that device for the
	current cgroup only.

	If any ancestor cgroup has set a swap priority configuration,
	it is inherited by all descendants. In that case, the child's
	own configuration is ignored and the topmost configured
	ancestor determines the effective priority ordering.

  memory.swap.priority.effective
	A read-only file showing the effective swap priority ordering
	actually applied to this cgroup, after resolving inheritance
	from ancestors.

	If there is no configuration in the current cgroup or its
	ancestors, this file shows the global swap device priority
	from `swapon`, as unique_id/priority pairs.

	Example: (global only)
	$ swapon
	NAME      TYPE      SIZE USED PRIO
	/dev/sdb  partition 300M   0B   10
	/dev/sdc  partition 300M   0B    5
	/dev/sdd  partition 300M   0B   -2
	$ cat /sys/fs/cgroup/parent/child/memory.swap.priority.effective
	1 10
	2 5
	3 -2

	Example: (with parent override)
	# Parent cgroup configuration
	$ cat /sys/fs/cgroup/parent/memory.swap.priority
	1 4
	2 -2
	# Child cgroup configuration (ignored because parent overrides)
	$ cat /sys/fs/cgroup/parent/child/memory.swap.priority
	1 8
	2 5
	# Effective priority seen by the child
	$ cat /sys/fs/cgroup/parent/child/memory.swap.priority.effective
	1 4
	2 -2

	In this case:
	- If no cgroup sets any configuration, the output matches the
	  global `swapon` priority.
	- If an ancestor has a configuration, the child inherits it and
	  ignores its own setting.

I hope my explanation clarifies my intention, and I would truly
appreciate your positive consideration and any further thoughts you
might have.

Best regards,
Youngjun Park
Hello.

On Tue, Jul 01, 2025 at 10:08:46PM +0900, YoungJun Park <youngjun.park@lge.com> wrote:
> memory.swap.priority
...
> To assign priorities to swap devices in the current cgroup,
> write one or more lines in the following format:
>
>   <swap_device_unique_id> <priority>

How would the user know this unique_id? (I don't see it in
/proc/swaps.)

> Note:
> A special value of -1 means the swap device is completely
> excluded from use by this cgroup. Unlike the global swap
> priority, where negative values simply lower the priority,
> setting -1 here disables allocation from that device for the
> current cgroup only.

The divergence from the global semantics is a little bit confusing.
There should preferably be a special value (like 'disabled') in the
interface, and possibly a second special value like 'none' that
denotes the default (for new (unconfigured) cgroups, or when a new
swap device is activated).

> memory.swap.priority.effective
> A read-only file showing the effective swap priority ordering
> actually applied to this cgroup, after resolving inheritance
> from ancestors.

Yes, this'd definitely be useful for troubleshooting and understanding
the configurations.

...

> In this case:
> - If no cgroup sets any configuration, the output matches the
>   global `swapon` priority.
> - If an ancestor has a configuration, the child inherits it and
>   ignores its own setting.

Could the child's priority be capped by its ancestors' instead of
wholly overwritten? (So that both retain some effect.)

Thanks,
Michal
On Mon, Jul 07, 2025 at 11:59:49AM +0200, Michal Koutný wrote:
> Hello.
>
> On Tue, Jul 01, 2025 at 10:08:46PM +0900, YoungJun Park <youngjun.park@lge.com> wrote:
> > memory.swap.priority
> ...
> > To assign priorities to swap devices in the current cgroup,
> > write one or more lines in the following format:
> >
> >   <swap_device_unique_id> <priority>
>
> How would the user know this unique_id? (I don't see it in
> /proc/swaps.)

The unique_id is a new concept I introduced to refer to assigned swap
devices. It's allocated whenever a swap device is turned on. I did
explore other key identifiers, like the swap device path, but I
determined that providing a separate unique_id is more suitable for
this context.

Initially, I proposed printing it directly from memory.swap.priority
to facilitate usage like:

  $ swapon
  NAME      TYPE      SIZE USED PRIO
  /dev/sdb  partition 300M   0B   10
  /dev/sdc  partition 300M   0B    5
  $ cat memory.swap.priority
  Active
  /dev/sdb unique:1 prio:10
  /dev/sdc unique:2 prio:5

Following your suggestion, I've dropped this initial proposal and
considered four alternatives. I'm currently leaning towards options 2
and 4, and I plan to propose option 4 as the primary approach:

1. /proc/swaps with the ID: rejected due to potential ABI changes.

2. A new /proc interface: this could be /proc/swaps with the ID, or a
   dedicated swapdevice file with the ID. While viable, I prefer not
   to add new /proc interfaces if we can avoid it.

3. A /sys/kernel/mm/swap/ location (similar to vma_ra_enabled):
   rejected because sysfs typically shows configured values, not
   dynamic identifiers, which would be inconsistent with existing
   conventions.

4. Align memory.swap.priority.effective with /proc/swaps: aligning the
   order of the id/prio pairs in memory.swap.priority.effective with
   the output order of /proc/swaps would allow users to infer which
   swap device corresponds to which ID. For example:

   $ swapon
   NAME      TYPE      SIZE USED PRIO
   /dev/sdb  partition 300M   0B   10
   /dev/sdc  partition 300M   0B    5
   $ cat memory.swap.priority.effective
   Active
   1 10   // this is /dev/sdb
   2 5    // this is /dev/sdc

> > Note:
> > A special value of -1 means the swap device is completely
> > excluded from use by this cgroup. Unlike the global swap
> > priority, where negative values simply lower the priority,
> > setting -1 here disables allocation from that device for the
> > current cgroup only.
>
> The divergence from the global semantics is a little bit confusing.
> There should preferably be a special value (like 'disabled') in the
> interface, and possibly a second special value like 'none' that
> denotes the default (for new (unconfigured) cgroups, or when a new
> swap device is activated).

Thank you for your insightful comments and suggestions regarding the
default values. I was initially focused on providing numerical values
for these settings. However, using keywords like "none" and "disabled"
for the default values makes the semantics much more natural and
user-friendly.

Based on your feedback and the cgroup-v2 documentation on default
values, I propose the following semantics:

  none: applies priority based on the global swap priority. Note that
	for negative priorities, this implies following the NUMA
	auto-binding rules rather than a direct application of the
	negative value itself.

  disabled: explicitly excludes the swap device from use by this
	cgroup.

Here's how these semantics would translate into usage:

  echo "default none" > memory.swap.priority
  (or echo "none" > memory.swap.priority):
	When swapon is active, the cgroup's swap device priority
	follows the global swap priority.

  echo "default disabled" > memory.swap.priority
  (or echo "default" > memory.swap.priority):
	When swapon is active, the swap device is excluded from
	allocation within this cgroup.

  echo "<id> none" > memory.swap.priority:
	The specified swap device follows its global swap priority.

  echo "<id> disabled" > memory.swap.priority:
	The specified swap device is excluded from allocation for this
	cgroup.

  echo "<id> <prio>" > memory.swap.priority:
	Sets a specific priority for the specified swap device.

> ...
> > In this case:
> > - If no cgroup sets any configuration, the output matches the
> >   global `swapon` priority.
> > - If an ancestor has a configuration, the child inherits it and
> >   ignores its own setting.
>
> Could the child's priority be capped by its ancestors' instead of
> wholly overwritten? (So that both retain some effect.)

Regarding the child's priority being capped or refined by its
ancestors' settings: I've considered allowing the child to resolve its
own settings when the sorted priority order is consistent and the
child's swap devices are a subset of the parent's. Here's a visual
representation of how that might work:

                +------------------+
                |  Parent cgroup   |
                | (Swaps: A, B, C) |
                +--------+---------+
                         |
                         | (Child applies settings to
                         |  its own children)
                         v
                +--------+---------+
                |   Child cgroup   |
                |  (Swaps: B, C)   |
                | (B & C resolved  |
                |  by the child's  |
                |  settings)       |
                +--------+---------+
                         |
              +----------+-----------+
              |                      |
              v                      v
  +-----------+---------+ +----------+----------+
  | Grandchild cgroup   | | Grandchild 2 cgroup |
  | (Swaps: C)          | | (Swaps: A)          |
  | (C resolved by the  | | (A not in B,C;      |
  |  grandchild's own   | |  resolved by the    |
  |  settings)          | |  child's settings)  |
  +---------------------+ +---------------------+

However, this feature isn't currently required for our immediate use
case, and it adds notable complexity to the implementation. I suggest
we consider this as a next step if the current feature is integrated
into the kernel and sees wider adoption, or if further use cases or
requirements emerge.

Best regards,
Youngjun Park
On Mon, Jul 07, 2025 at 11:45:25PM +0900, YoungJun Park wrote:
> $ cat memory.swap.priority.effective
> Active
> 1 10   // this is /dev/sdb
> 2 5    // this is /dev/sdc

Please disregard the "Active" line. I apologize; I mistakenly included
incorrect output.

  $ cat memory.swap.priority.effective
  1 10   // this is /dev/sdb
  2 5    // this is /dev/sdc

Best regards,
Youngjun Park