From: "youngjun.park" <youngjun.park@lge.com>
This patch implements swap device selection and swap on/off propagation
when a cgroup-specific swap priority is set.

There is one workaround in this implementation, as follows.
The current per-CPU swap cluster enforces swap device selection based
solely on CPU locality, overriding the swap cgroup's configured priorities.
Therefore, when a swap cgroup priority is assigned, we fall back to
using per-CPU clusters per swap device, similar to the previous behavior.

A proper fix for this workaround will be evaluated in the next patch.
Signed-off-by: Youngjun park <youngjun.park@lge.com>
---
include/linux/swap.h | 8 +++
mm/swap.h | 8 +++
mm/swap_cgroup_priority.c | 133 ++++++++++++++++++++++++++++++++++++++
mm/swapfile.c | 125 ++++++++++++++++++++++++-----------
4 files changed, 238 insertions(+), 36 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 49b73911c1bd..d158b0d5c997 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -283,6 +283,13 @@ enum swap_cluster_flags {
#define SWAP_NR_ORDERS 1
#endif
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+struct percpu_cluster {
+ local_lock_t lock; /* Protect this percpu_cluster */
+ unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
+};
+#endif
+
/*
* We keep using same cluster for rotational device so IO will be sequential.
* The purpose is to optimize SWAP throughput on these device.
@@ -341,6 +348,7 @@ struct swap_info_struct {
struct list_head discard_clusters; /* discard clusters list */
#ifdef CONFIG_SWAP_CGROUP_PRIORITY
int unique_id;
+ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
#endif
struct plist_node avail_lists[]; /*
* entries in swap_avail_heads, one
diff --git a/mm/swap.h b/mm/swap.h
index cd2649c632ed..cb6d653fe3f1 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -113,7 +113,15 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg);
void show_swap_device_unique_id(struct seq_file *m);
#else
static inline void delete_swap_cgroup_priority(struct mem_cgroup *memcg) {}
+static inline void activate_swap_cgroup_priority_pnode(struct swap_info_struct *swp, bool swapon) {}
+static inline void deactivate_swap_cgroup_priority_pnode(struct swap_info_struct *swp, bool swapoff){}
static inline void get_swap_unique_id(struct swap_info_struct *si) {}
+static inline bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
+ swp_entry_t *entry, int order)
+{
+ return false;
+}
+
#endif
#else /* CONFIG_SWAP */
diff --git a/mm/swap_cgroup_priority.c b/mm/swap_cgroup_priority.c
index b3e20b676680..bb18cb251f60 100644
--- a/mm/swap_cgroup_priority.c
+++ b/mm/swap_cgroup_priority.c
@@ -54,6 +54,132 @@ static void get_swap_unique_id(struct swap_info_struct *si)
si->unique_id = atomic_add_return(1, &swap_unique_id_counter);
}
+static bool swap_alloc_cgroup_priority(struct mem_cgroup *memcg,
+ swp_entry_t *entry, int order)
+{
+ struct swap_cgroup_priority *swap_priority;
+ struct swap_cgroup_priority_pnode *pnode, *next;
+ unsigned long offset;
+ int node;
+
+ if (!memcg)
+ return false;
+
+ spin_lock(&swap_avail_lock);
+priority_check:
+ swap_priority = memcg->swap_priority;
+ if (!swap_priority) {
+ spin_unlock(&swap_avail_lock);
+ return false;
+ }
+
+ node = numa_node_id();
+start_over:
+ plist_for_each_entry_safe(pnode, next, &swap_priority->plist[node],
+ avail_lists[node]) {
+ struct swap_info_struct *si = pnode->swap;
+ plist_requeue(&pnode->avail_lists[node],
+ &swap_priority->plist[node]);
+ spin_unlock(&swap_avail_lock);
+
+ if (get_swap_device_info(si)) {
+ offset = cluster_alloc_swap_entry(si,
+ order, SWAP_HAS_CACHE, true);
+ put_swap_device(si);
+ if (offset) {
+ *entry = swp_entry(si->type, offset);
+ return true;
+ }
+ if (order)
+ return false;
+ }
+
+ spin_lock(&swap_avail_lock);
+
+ /* swap_priority was removed or changed under us. */
+ if (swap_priority != memcg->swap_priority)
+ goto priority_check;
+
+ if (plist_node_empty(&next->avail_lists[node]))
+ goto start_over;
+ }
+ spin_unlock(&swap_avail_lock);
+
+ return false;
+}
+
+/* add_to_avail_list (swapon / swap usage > 0) */
+static void activate_swap_cgroup_priority_pnode(struct swap_info_struct *swp,
+ bool swapon)
+{
+ struct swap_cgroup_priority *swap_priority;
+ int i;
+
+ list_for_each_entry(swap_priority, &swap_cgroup_priority_list, link) {
+ struct swap_cgroup_priority_pnode *pnode
+ = swap_priority->pnode[swp->type];
+
+ if (swapon) {
+ pnode->swap = swp;
+ pnode->prio = swp->prio;
+ }
+
+ /* NUMA priority handling */
+ for_each_node(i) {
+ if (swapon) {
+ if (swap_node(swp) == i) {
+ plist_node_init(
+ &pnode->avail_lists[i],
+ 1);
+ } else {
+ plist_node_init(
+ &pnode->avail_lists[i],
+ -pnode->prio);
+ }
+ }
+
+ plist_add(&pnode->avail_lists[i],
+ &swap_priority->plist[i]);
+ }
+ }
+}
+
+/* del_from_avail_list (swapoff / swap usage <= 0) */
+static void deactivate_swap_cgroup_priority_pnode(struct swap_info_struct *swp,
+ bool swapoff)
+{
+ struct swap_cgroup_priority *swap_priority;
+ int nid, i;
+
+ list_for_each_entry(swap_priority, &swap_cgroup_priority_list, link) {
+ struct swap_cgroup_priority_pnode *pnode;
+
+ if (swapoff && swp->prio < 0) {
+ /*
+ * NUMA priority handling
+ * mimic swapoff prio adjustment without plist
+ */
+ for (int i = 0; i < MAX_SWAPFILES; i++) {
+ pnode = swap_priority->pnode[i];
+ if (pnode->prio > swp->prio ||
+ pnode->swap == swp)
+ continue;
+
+ pnode->prio++;
+ for_each_node(nid) {
+ if (pnode->avail_lists[nid].prio != 1)
+ pnode->avail_lists[nid].prio--;
+ }
+ }
+ }
+
+ pnode = swap_priority->pnode[swp->type];
+ for_each_node(i)
+ plist_del(&pnode->avail_lists[i],
+ &swap_priority->plist[i]);
+ }
+}
+
int create_swap_cgroup_priority(struct mem_cgroup *memcg,
int unique[], int prio[], int nr)
{
@@ -183,6 +309,12 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
{
struct swap_cgroup_priority *swap_priority;
+ /*
+ * XXX: Possible RCU wait? No. Cannot protect priority list addition.
+ * swap_avail_lock gives protection.
+ * Think about whether another object protection mechanism
+ * (e.g. object reference) might solve this better.
+ */
spin_lock(&swap_avail_lock);
swap_priority = memcg->swap_priority;
if (!swap_priority) {
@@ -198,5 +330,6 @@ void delete_swap_cgroup_priority(struct mem_cgroup *memcg)
for (int i = 0; i < MAX_SWAPFILES; i++)
kvfree(swap_priority->pnode[i]);
+
kvfree(swap_priority);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f8e48dd2381e..28afe4ec0504 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -126,8 +126,12 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
.offset = { SWAP_ENTRY_INVALID },
.lock = INIT_LOCAL_LOCK(),
};
-/* TODO: better choice? */
+/* TODO: better arrangement */
#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+static bool get_swap_device_info(struct swap_info_struct *si);
+static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
+ unsigned char usage, bool is_cgroup_priority);
+static int swap_node(struct swap_info_struct *si);
#include "swap_cgroup_priority.c"
#endif
@@ -776,7 +780,8 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
struct swap_cluster_info *ci,
unsigned long offset,
unsigned int order,
- unsigned char usage)
+ unsigned char usage,
+ bool is_cgroup_priority)
{
unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
@@ -820,12 +825,19 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
out:
relocate_cluster(si, ci);
unlock_cluster(ci);
+
if (si->flags & SWP_SOLIDSTATE) {
- this_cpu_write(percpu_swap_cluster.offset[order], next);
- this_cpu_write(percpu_swap_cluster.si[order], si);
- } else {
+ if (!is_cgroup_priority) {
+ this_cpu_write(percpu_swap_cluster.offset[order], next);
+ this_cpu_write(percpu_swap_cluster.si[order], si);
+ } else {
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ __this_cpu_write(si->percpu_cluster->next[order], next);
+#endif
+ }
+ } else
si->global_cluster->next[order] = next;
- }
+
return found;
}
@@ -883,7 +895,7 @@ static void swap_reclaim_work(struct work_struct *work)
* cluster for current CPU too.
*/
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
- unsigned char usage)
+ unsigned char usage, bool is_cgroup_priority)
{
struct swap_cluster_info *ci;
unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
@@ -895,32 +907,38 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
if (order && !(si->flags & SWP_BLKDEV))
return 0;
- if (!(si->flags & SWP_SOLIDSTATE)) {
+ if (si->flags & SWP_SOLIDSTATE) {
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ local_lock(&si->percpu_cluster->lock);
+ offset = __this_cpu_read(si->percpu_cluster->next[order]);
+#endif
+ } else {
/* Serialize HDD SWAP allocation for each device. */
spin_lock(&si->global_cluster_lock);
offset = si->global_cluster->next[order];
- if (offset == SWAP_ENTRY_INVALID)
- goto new_cluster;
+ }
- ci = lock_cluster(si, offset);
- /* Cluster could have been used by another order */
- if (cluster_is_usable(ci, order)) {
- if (cluster_is_empty(ci))
- offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset,
- order, usage);
- } else {
- unlock_cluster(ci);
- }
- if (found)
- goto done;
+ if (offset == SWAP_ENTRY_INVALID)
+ goto new_cluster;
+
+ ci = lock_cluster(si, offset);
+ /* Cluster could have been used by another order */
+ if (cluster_is_usable(ci, order)) {
+ if (cluster_is_empty(ci))
+ offset = cluster_offset(si, ci);
+ found = alloc_swap_scan_cluster(si, ci, offset,
+ order, usage, is_cgroup_priority);
+ } else {
+ unlock_cluster(ci);
}
+ if (found)
+ goto done;
new_cluster:
ci = isolate_lock_cluster(si, &si->free_clusters);
if (ci) {
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ order, usage, is_cgroup_priority);
if (found)
goto done;
}
@@ -934,7 +952,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ order, usage, is_cgroup_priority);
if (found)
goto done;
/* Clusters failed to allocate are moved to frag_clusters */
@@ -952,7 +970,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
* reclaimable (eg. lazy-freed swap cache) slots.
*/
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- order, usage);
+ order, usage, is_cgroup_priority);
if (found)
goto done;
frags++;
@@ -979,21 +997,27 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
atomic_long_dec(&si->frag_cluster_nr[o]);
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
+ 0, usage, is_cgroup_priority);
if (found)
goto done;
}
while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
- 0, usage);
+ 0, usage, is_cgroup_priority);
if (found)
goto done;
}
}
done:
- if (!(si->flags & SWP_SOLIDSTATE))
+ if (si->flags & SWP_SOLIDSTATE) {
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ local_unlock(&si->percpu_cluster->lock);
+#endif
+ } else {
spin_unlock(&si->global_cluster_lock);
+ }
+
return found;
}
@@ -1032,6 +1056,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
for_each_node(nid)
plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ deactivate_swap_cgroup_priority_pnode(si, swapoff);
skip:
spin_unlock(&swap_avail_lock);
}
@@ -1075,6 +1100,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
for_each_node(nid)
plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+ activate_swap_cgroup_priority_pnode(si, swapon);
skip:
spin_unlock(&swap_avail_lock);
}
@@ -1200,7 +1226,8 @@ static bool swap_alloc_fast(swp_entry_t *entry,
if (cluster_is_usable(ci, order)) {
if (cluster_is_empty(ci))
offset = cluster_offset(si, ci);
- found = alloc_swap_scan_cluster(si, ci, offset, order, SWAP_HAS_CACHE);
+ found = alloc_swap_scan_cluster(si, ci, offset, order,
+ SWAP_HAS_CACHE, false);
if (found)
*entry = swp_entry(si->type, found);
} else {
@@ -1227,7 +1254,7 @@ static bool swap_alloc_slow(swp_entry_t *entry,
plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
spin_unlock(&swap_avail_lock);
if (get_swap_device_info(si)) {
- offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
+ offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE, false);
put_swap_device(si);
if (offset) {
*entry = swp_entry(si->type, offset);
@@ -1294,10 +1321,12 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
}
}
- local_lock(&percpu_swap_cluster.lock);
- if (!swap_alloc_fast(&entry, order))
- swap_alloc_slow(&entry, order);
- local_unlock(&percpu_swap_cluster.lock);
+ if (!swap_alloc_cgroup_priority(folio_memcg(folio), &entry, order)) {
+ local_lock(&percpu_swap_cluster.lock);
+ if (!swap_alloc_fast(&entry, order))
+ swap_alloc_slow(&entry, order);
+ local_unlock(&percpu_swap_cluster.lock);
+ }
/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
if (mem_cgroup_try_charge_swap(folio, entry))
@@ -1870,7 +1899,7 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
- offset = cluster_alloc_swap_entry(si, 0, 1);
+ offset = cluster_alloc_swap_entry(si, 0, 1, false);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
@@ -2800,6 +2829,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
arch_swap_invalidate_area(p->type);
zswap_swapoff(p->type);
mutex_unlock(&swapon_mutex);
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ free_percpu(p->percpu_cluster);
+ p->percpu_cluster = NULL;
+#endif
kfree(p->global_cluster);
p->global_cluster = NULL;
vfree(swap_map);
@@ -3207,7 +3240,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (!(si->flags & SWP_SOLIDSTATE)) {
+ if (si->flags & SWP_SOLIDSTATE) {
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ goto err_free;
+
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_ENTRY_INVALID;
+ local_lock_init(&cluster->lock);
+ }
+#endif
+ } else {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
GFP_KERNEL);
if (!si->global_cluster)
@@ -3495,6 +3544,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
+#ifdef CONFIG_SWAP_CGROUP_PRIORITY
+ free_percpu(si->percpu_cluster);
+ si->percpu_cluster = NULL;
+#endif
kfree(si->global_cluster);
si->global_cluster = NULL;
inode = NULL;
--
2.34.1
On Thu, Jun 12, 2025 at 6:43 PM <youngjun.park@lge.com> wrote:
>
> From: "youngjun.park" <youngjun.park@lge.com>
>

Hi, Youngjun,

Thanks for sharing this series.

> This patch implements swap device selection and swap on/off propagation
> when a cgroup-specific swap priority is set.
>
> There is one workaround in this implementation, as follows.
> The current per-CPU swap cluster enforces swap device selection based
> solely on CPU locality, overriding the swap cgroup's configured priorities.

I've been thinking about this, we can switch to a per-cgroup-per-cpu
next cluster selector, the problem with current code is that swap
allocator is not designed with folio / cgroup in mind at all, so it's
really ugly to implement, which is why I have following two patches in
the swap table series:

https://lore.kernel.org/linux-mm/20250514201729.48420-18-ryncsn@gmail.com/
https://lore.kernel.org/linux-mm/20250514201729.48420-22-ryncsn@gmail.com/

The first one makes all swap allocation starts with a folio, the
second one makes the allocator always folio aware. So you can know
which cgroup is doing the allocation at anytime inside the allocator
(and it reduced the number of argument, also improving performance :) )

So the allocator can just use cgroup's swap info if available, plist,
percpu cluster, and fallback to global locality in a very natural way.

> Therefore, when a swap cgroup priority is assigned, we fall back to
> using per-CPU clusters per swap device, similar to the previous behavior.
>
> A proper fix for this workaround will be evaluated in the next patch.

Hmm, but this is already the last patch in the series?

> [... rest of the patch quoted unmodified, snipped ...]
On Thu, Jun 12, 2025 at 07:14:20PM +0800, Kairui Song wrote:
[...]
> I've been thinking about this, we can switch to a per-cgroup-per-cpu
> next cluster selector, the problem with current code is that swap
> allocator is not designed with folio / cgroup in mind at all, so it's
> really ugly to implement, which is why I have following two patches in
> the swap table series:

This seems to be the suitable alternative for upstream at the moment.
I think there are still a few things that need to be considered, though.
(Nhat pointed it out well. I've shared my thoughts on that context.)

> https://lore.kernel.org/linux-mm/20250514201729.48420-18-ryncsn@gmail.com/
> https://lore.kernel.org/linux-mm/20250514201729.48420-22-ryncsn@gmail.com/
>
> The first one makes all swap allocation starts with a folio, the
> second one makes the allocator always folio aware. So you can know
> which cgroup is doing the allocation at anytime inside the allocator
> (and it reduced the number of argument, also improving performance :) )
>
> So the allocator can just use cgroup's swap info if available, plist,
> percpu cluster, and fallback to global locality in a very natural way.

Wow! This is exactly the situation I needed.
I thought it was uncomfortable to pass the memcg parameter.
If memcg can be naturally identified within the allocation, as you
mentioned, it would be good both performance-wise and design-wise.

> > Therefore, when a swap cgroup priority is assigned, we fall back to
> > using per-CPU clusters per swap device, similar to the previous behavior.
> >
> > A proper fix for this workaround will be evaluated in the next patch.
>
> Hmm, but this is already the last patch in the series?

Ah! The "next patch" refers to the next patch series.
I'm still evaluating this part and wasn't confident enough to include it
in the current version. At first, I wanted to get feedback on the core
part I'm currently pursuing.
On Thu, Jun 12, 2025 at 4:14 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Thu, Jun 12, 2025 at 6:43 PM <youngjun.park@lge.com> wrote:
[...]
> I've been thinking about this, we can switch to a per-cgroup-per-cpu
> next cluster selector, the problem with current code is that swap

What about per-cpu-per-order-per-swap-device :-? Number of swap
devices is gonna be smaller than number of cgroups, right?

At swap slot allocation time, we check the folio's swap device
priority list, then pump that all the way to the swap allocator.

swap allocator, given a priority list, for each priority level, try to
allocate from that level first. It will get a cluster (either locally
cached or a new one) from swap devices in that priority level, before
moving on to the next priority level.
On Fri, Jun 13, 2025 at 1:28 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Thu, Jun 12, 2025 at 4:14 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > I've been thinking about this, we can switch to a per-cgroup-per-cpu
> > next cluster selector, the problem with current code is that swap
>
> What about per-cpu-per-order-per-swap-device :-? Number of swap
> devices is gonna be smaller than number of cgroups, right?

Hi Nhat,

The problem is per cgroup makes more sense (I was suggested to use
cgroup level locality at the very beginning of the implementation of
the allocator in the mail list, but it was hard to do so at that
time), for container environments, a cgroup is a container that runs
one type of workload, so it has its own locality. Things like systemd
also organize different desktop workloads into cgroups. The whole
point is about cgroup.

There could be a lot of cgroups indeed, but not every one of them is
going to enable a cgroup level swap configuration. Youngjun used a
pointer in mem_cgroup, so disabled cgroups have no overhead.

We had a per-device-per-cpu-per-order table previously (before
1b7e90020eb77). It works. Only minor problem is allocation has to
iterate the plist first, then use the si->percpu, and usually there
are only a few swap devices, much less flexible than cgroups.

> At swap slot allocation time, we check the folio's swap device
> priority list, then pump that all the way to the swap allocator.
>
> swap allocator, given a priority list, for each priority level, try to
> allocate from that level first. It will get a cluster (either locally
> cached or a new one) from swap devices in that priority level, before
> moving on to the next priority level.
On Thu, Jun 12, 2025 at 11:20 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Fri, Jun 13, 2025 at 1:28 AM Nhat Pham <nphamcs@gmail.com> wrote:
> >
> > What about per-cpu-per-order-per-swap-device :-? Number of swap
> > devices is gonna be smaller than number of cgroups, right?
>
> Hi Nhat,
>
> The problem is per cgroup makes more sense (I was suggested to use
> cgroup level locality at the very beginning of the implementation of
> the allocator in the mail list, but it was hard to do so at that
> time), for container environments, a cgroup is a container that runs
> one type of workload, so it has its own locality. Things like systemd
> also organize different desktop workloads into cgroups. The whole
> point is about cgroup.

Yeah I know what cgroup represents. Which is why I mentioned in the
next paragraph that we are still making decisions per-cgroup - we
just organize the per-cpu cache based on swap devices. This way, two
cgroups with similar/same priority list can share the clusters, for
each swapfile, in each CPU. There will be a lot less duplication and
overhead. And two cgroups with different priority lists won't
interfere with each other, since they'll target different swapfiles.

Unless we want to nudge the swapfiles/clusters to be self-partitioned
among the cgroups? :) IOW, each cluster contains pages mostly from a
single cgroup (with some stragglers mixed in). I suppose that will be
very useful for swap on rotational drives where read contiguity is
imperative, but not sure about other backends :-?

Anyway, no strong opinions to be completely honest :) Was just
throwing out some ideas. Per-cgroup-per-cpu-per-order sounds good to
me too, if it's easy to do.
On Thu, Jun 12, 2025 at 01:08:08PM -0700, Nhat Pham wrote:
> On Thu, Jun 12, 2025 at 11:20 AM Kairui Song <ryncsn@gmail.com> wrote:
[...]
> Yeah I know what cgroup represents. Which is why I mentioned in the
> next paragraph that we are still making decisions per-cgroup - we
> just organize the per-cpu cache based on swap devices. This way, two
> cgroups with similar/same priority list can share the clusters, for
> each swapfile, in each CPU. There will be a lot less duplication and
> overhead. And two cgroups with different priority lists won't
> interfere with each other, since they'll target different swapfiles.
>
> Unless we want to nudge the swapfiles/clusters to be self-partitioned
> among the cgroups? :) IOW, each cluster contains pages mostly from a
> single cgroup (with some stragglers mixed in). I suppose that will be
> very useful for swap on rotational drives where read contiguity is
> imperative, but not sure about other backends :-?
>
> Anyway, no strong opinions to be completely honest :) Was just
> throwing out some ideas. Per-cgroup-per-cpu-per-order sounds good to
> me too, if it's easy to do.

Good point!
I agree with the point about self-partitioned clusters and duplicated
priority. One concern is the cost of synchronization, specifically the
one incurred when accessing the prioritized swap device.
From a simple performance perspective, a per-cgroup-per-CPU implementation
seems favorable - in line with the current swap allocation fastpath.

It seems most reasonable to carefully compare the pros and cons of the
two approaches.

To summarize,

Option 1. per-cgroup-per-cpu
  Pros: upstream fit, performance
  Cons: duplicate priority (some memory structure consumption cost),
        self-partitioned clusters

Option 2. per-cpu-per-order (per-device)
  Pros: Cons of Option 1
  Cons: Pros of Option 1

It's not easy to draw a definitive conclusion right away,
and I should also evaluate other pros and cons that may arise during
actual implementation, so I'd like to take some time to review things
in more detail and share my thoughts and conclusions in the next patch
series.

What do you think, Nhat and Kairui?
On Fri, Jun 13, 2025 at 3:11 PM YoungJun Park <youngjun.park@lge.com> wrote:
>
> On Thu, Jun 12, 2025 at 01:08:08PM -0700, Nhat Pham wrote:
[...]
> To summarize,
>
> Option 1. per-cgroup-per-cpu
>   Pros: upstream fit, performance
>   Cons: duplicate priority (some memory structure consumption cost),
>         self-partitioned clusters
>
> Option 2. per-cpu-per-order (per-device)
>   Pros: Cons of Option 1
>   Cons: Pros of Option 1
[...]
> What do you think, Nhat and Kairui?

Ah, I think what might be best fits here is, each cgroup have a pcp
device list, and each device have a pcp cluster list:

folio -> mem_cgroup -> swap_priority (maybe a more generic name is
better?) -> swap_device_pcp (recording only the *si per order)
swap_device_info -> swap_cluster_pcp (cluster offset per order)

And if mem_cgroup -> swap_priority is NULL, fallback to a global
swap_device_pcp.

This seems to fit what Nhat suggested, and easy to implement, since
both si and folio->memcg are accessible easily.
On Fri, Jun 13, 2025 at 3:36 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Fri, Jun 13, 2025 at 3:11 PM YoungJun Park <youngjun.park@lge.com> wrote:
[...]
> Ah, I think what might be best fits here is, each cgroup have a pcp
> device list, and each device have a pcp cluster list:
>
> folio -> mem_cgroup -> swap_priority (maybe a more generic name is
> better?) -> swap_device_pcp (recording only the *si per order)
> swap_device_info -> swap_cluster_pcp (cluster offset per order)

Sorry the truncate made this hard to read, let me try again:

folio ->
mem_cgroup ->
swap_priority (maybe a more generic name is better?) ->
swap_device_pcp (recording only the *si per order)

And:
swap_device_info ->
swap_cluster_pcp (cluster offset per order)

And if mem_cgroup -> swap_priority is NULL,
fallback to a global swap_device_pcp.
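A rough C sketch of the two-level layout described in the mail above,
for readers skimming the thread. This is not code from the series: the
type names follow the mail, while the member names, the helper below
and the use of the global percpu_swap_cluster fallback are assumptions,
and locking is omitted.

struct swap_device_pcp {
        /* per-cgroup, per-CPU: which device to try next, per order */
        struct swap_info_struct *si[SWAP_NR_ORDERS];
};

struct swap_cluster_pcp {
        /* per-device, per-CPU: next cluster offset hint, per order */
        unsigned int next[SWAP_NR_ORDERS];
};

struct swap_cgroup_priority {
        struct plist_head plist[MAX_NUMNODES];    /* per-node plists, as in this patch */
        struct swap_device_pcp __percpu *dev_pcp; /* assumed member */
        /* ... */
};

/* swap_info_struct would then carry:
 *      struct swap_cluster_pcp __percpu *cluster_pcp;
 */

static struct swap_info_struct *
swap_pick_device_hint(struct mem_cgroup *memcg, int order)
{
        struct swap_cgroup_priority *prio;

        prio = memcg ? READ_ONCE(memcg->swap_priority) : NULL;
        if (!prio)
                /* No per-cgroup setting: keep the global per-CPU cache. */
                return this_cpu_read(percpu_swap_cluster.si[order]);

        /*
         * The cgroup-level hint picks the device; the chosen si's
         * cluster_pcp (per-device, per-CPU) then picks the cluster.
         */
        return this_cpu_ptr(prio->dev_pcp)->si[order];
}

Whether the per-CPU cache lives in the cgroup or in the device is
exactly the trade-off weighed above; the sketch simply combines the two
levels as suggested here.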
On Fri, Jun 13, 2025 at 03:38:37PM +0800, Kairui Song wrote: > On Fri, Jun 13, 2025 at 3:36 PM Kairui Song <ryncsn@gmail.com> wrote: > > > > On Fri, Jun 13, 2025 at 3:11 PM YoungJun Park <youngjun.park@lge.com> wrote: > > > > > > On Thu, Jun 12, 2025 at 01:08:08PM -0700, Nhat Pham wrote: > > > > On Thu, Jun 12, 2025 at 11:20 AM Kairui Song <ryncsn@gmail.com> wrote: > > > > > > > > > > On Fri, Jun 13, 2025 at 1:28 AM Nhat Pham <nphamcs@gmail.com> wrote: > > > > > > > > > > > > On Thu, Jun 12, 2025 at 4:14 AM Kairui Song <ryncsn@gmail.com> wrote: > > > > > > > > > > > > > > On Thu, Jun 12, 2025 at 6:43 PM <youngjun.park@lge.com> wrote: > > > > > > > > > > > > > > > > From: "youngjun.park" <youngjun.park@lge.com> > > > > > > > > > > > > > > > > > > > > > > Hi, Youngjun, > > > > > > > > > > > > > > Thanks for sharing this series. > > > > > > > > > > > > > > > This patch implements swap device selection and swap on/off propagation > > > > > > > > when a cgroup-specific swap priority is set. > > > > > > > > > > > > > > > > There is one workaround to this implementation as follows. > > > > > > > > Current per-cpu swap cluster enforces swap device selection based solely > > > > > > > > on CPU locality, overriding the swap cgroup's configured priorities. > > > > > > > > > > > > > > I've been thinking about this, we can switch to a per-cgroup-per-cpu > > > > > > > next cluster selector, the problem with current code is that swap > > > > > > > > > > > > What about per-cpu-per-order-per-swap-device :-? Number of swap > > > > > > devices is gonna be smaller than number of cgroups, right? > > > > > > > > > > Hi Nhat, > > > > > > > > > > The problem is per cgroup makes more sense (I was suggested to use > > > > > cgroup level locality at the very beginning of the implementation of > > > > > the allocator in the mail list, but it was hard to do so at that > > > > > time), for container environments, a cgroup is a container that runs > > > > > one type of workload, so it has its own locality. Things like systemd > > > > > also organize different desktop workloads into cgroups. The whole > > > > > point is about cgroup. > > > > > > > > Yeah I know what cgroup represents. Which is why I mentioned in the > > > > next paragraph that are still making decisions based per-cgroup - we > > > > just organize the per-cpu cache based on swap devices. This way, two > > > > cgroups with similar/same priority list can share the clusters, for > > > > each swapfile, in each CPU. There will be a lot less duplication and > > > > overhead. And two cgroups with different priority lists won't > > > > interfere with each other, since they'll target different swapfiles. > > > > > > > > Unless we want to nudge the swapfiles/clusters to be self-partitioned > > > > among the cgroups? :) IOW, each cluster contains pages mostly from a > > > > single cgroup (with some stranglers mixed in). I suppose that will be > > > > very useful for swap on rotational drives where read contiguity is > > > > imperative, but not sure about other backends :-? > > > > Anyway, no strong opinions to be completely honest :) Was just > > > > throwing out some ideas. Per-cgroup-per-cpu-per-order sounds good to > > > > me too, if it's easy to do. > > > > > > Good point! > > > I agree with the mention that self-partitioned clusters and duplicated priority. > > > One concern is the cost of synchronization. 
> > > Specifically, the one incurred when accessing the prioritized swap device.
> > > From a simple performance perspective, a per-cgroup-per-CPU implementation
> > > seems favorable - in line with the current swap allocation fastpath.
> > >
> > > It seems most reasonable to carefully compare the pros and cons of the
> > > two approaches.
> > >
> > > To summarize,
> > >
> > > Option 1. per-cgroup-per-cpu
> > > Pros: upstream fit, performance.
> > > Cons: duplicate priority (some memory structure consumption cost),
> > > self-partitioned clusters
> > >
> > > Option 2. per-cpu-per-order (per-device)
> > > Pros: Cons of Option 1
> > > Cons: Pros of Option 1
> > >
> > > It's not easy to draw a definitive conclusion right away;
> > > I should also evaluate other pros and cons that may arise during actual
> > > implementation.
> > > So I'd like to take some time to review things in more detail
> > > and share my thoughts and conclusions in the next patch series.
> > >
> > > What do you think, Nhat and Kairui?
> >
> > Ah, I think what might fit best here is: each cgroup has a pcp
> > device list, and each device has a pcp cluster list:
> >
> > folio -> mem_cgroup -> swap_priority (maybe a more generic name is
> > better?) -> swap_device_pcp (recording only the *si per order)
> > swap_device_info -> swap_cluster_pcp (cluster offset per order)
>
> Sorry the truncation made this hard to read, let me try again:
>
> folio ->
> mem_cgroup ->
> swap_priority (maybe a more generic name is better?) ->
> swap_device_pcp (recording only the *si per order)
>
> And:
> swap_device_info ->
> swap_cluster_pcp (cluster offset per order)
>
> And if mem_cgroup -> swap_priority is NULL,
> fall back to a global swap_device_pcp.

Thank you for the quick and kind feedback. This is a really good idea :)
For my workaround proposal, I just need to add the swap_device_pcp part
along with some refactoring.

On the naming: I adopted the term "swap_cgroup_priority" based on the
functionality I'm aiming to implement. Here are some alternatives that
immediately come to mind (like I said, they just come to mind):
* swap_tier, swap_order, swap_selection,
* swap_cgroup_tier, swap_cgroup_order, swap_cgroup_selection...

I'll try to come up with a more suitable conceptual name as I continue
working on the patch. In the meantime, I'd appreciate any suggestions or
feedback you may have.

Thanks again for your feedback and suggestions.
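[Editor's note: to make the trade-off in the two options above concrete, here
is a rough C sketch; the struct and field names are invented for illustration,
the 100-cgroup count is an assumption, and neither layout is the kernel's.]

#include <stdio.h>

#define SWAP_NR_ORDERS 1
#define NR_CPUS        4
#define MAX_SWAPFILES  32

/* Option 1: per-cgroup-per-cpu. Each cgroup carries its own per-CPU
 * next-cluster state, so one cgroup's allocations never disturb another's
 * locality, but the state is duplicated per cgroup and clusters tend to
 * end up partitioned between cgroups. */
struct option1_cgroup_pcp {
	void        *si[SWAP_NR_ORDERS];     /* preferred swap device */
	unsigned int next[SWAP_NR_ORDERS];   /* next cluster offset on it */
};
struct option1_cgroup_state {
	struct option1_cgroup_pcp pcp[NR_CPUS];  /* one copy per cgroup */
};

/* Option 2: per-cpu-per-order, hung off each swap device. Cgroups that
 * resolve to the same device share the same cluster hints, so there is no
 * duplication, but they also share (and contend on) that locality. */
struct option2_device_pcp {
	unsigned int next[SWAP_NR_ORDERS];   /* next cluster offset */
};
struct option2_device_state {
	struct option2_device_pcp pcp[NR_CPUS];  /* one copy per device */
};

int main(void)
{
	/* Rough bookkeeping cost, assuming 100 cgroups vs MAX_SWAPFILES devices. */
	printf("option 1: %zu bytes\n", 100 * sizeof(struct option1_cgroup_state));
	printf("option 2: %zu bytes\n",
	       (size_t)MAX_SWAPFILES * sizeof(struct option2_device_state));
	return 0;
}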
On Thu, Jun 12, 2025 at 7:14 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Thu, Jun 12, 2025 at 6:43 PM <youngjun.park@lge.com> wrote:
> >
> > From: "youngjun.park" <youngjun.park@lge.com>
>
> Hi, Youngjun,
>
> Thanks for sharing this series.
>
> > This patch implements swap device selection and swap on/off propagation
> > when a cgroup-specific swap priority is set.
> >
> > There is one workaround to this implementation as follows.
> > Current per-cpu swap cluster enforces swap device selection based solely
> > on CPU locality, overriding the swap cgroup's configured priorities.
>
> I've been thinking about this, we can switch to a per-cgroup-per-cpu
> next cluster selector, the problem with current code is that swap
> allocator is not designed with folio / cgroup in mind at all, so it's
> really ugly to implement, which is why I have following two patches in
> the swap table series:
>
> https://lore.kernel.org/linux-mm/20250514201729.48420-18-ryncsn@gmail.com/
> https://lore.kernel.org/linux-mm/20250514201729.48420-22-ryncsn@gmail.com/

And BTW this is not the only reason, these two are also quite critical
to get rid of the swap_cgroup_ctrl later, and maybe switch to use folio
lock for more swap operations, etc..

> The first one makes all swap allocation starts with a folio, the
> second one makes the allocator always folio aware. So you can know
> which cgroup is doing the allocation at anytime inside the allocator
> (and it reduced the number of argument, also improving performance :)
> )
>
> So the allocator can just use cgroup's swap info if available, plist,
> percpu cluster, and fallback to global locality in a very natural way.
>
> > Therefore, when a swap cgroup priority is assigned, we fall back to
> > using per-CPU clusters per swap device, similar to the previous behavior.
> >
> > A proper fix for this workaround will be evaluated in the next patch.
>
> Hmm, but this is already the last patch in the series?
>
> > Signed-off-by: Youngjun park <youngjun.park@lge.com>
> > ---
> >  include/linux/swap.h      |   8 +++
> >  mm/swap.h                 |   8 +++
> >  mm/swap_cgroup_priority.c | 133 ++++++++++++++++++++++++++++++++++++++
> >  mm/swapfile.c             | 125 ++++++++++++++++++++++++-----------
> >  4 files changed, 238 insertions(+), 36 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 49b73911c1bd..d158b0d5c997 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -283,6 +283,13 @@ enum swap_cluster_flags {
> >  #define SWAP_NR_ORDERS 1
> >  #endif
> >
> > +#ifdef CONFIG_SWAP_CGROUP_PRIORITY
> > +struct percpu_cluster {
> > +	local_lock_t lock; /* Protect the percpu_cluster above */
> > +	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
> > +};
> > +#endif
> > +
> >  /*
> >   * We keep using same cluster for rotational device so IO will be sequential.
> >   * The purpose is to optimize SWAP throughput on these device.
> >
> > [... remainder of quoted patch snipped; no further inline comments below ...]
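[Editor's note: a simplified user-space model of the allocation entry flow the
patch and the discussion converge on - consult the cgroup's priority list
first, then fall back to the global path. try_cgroup_priority(), global_fast()
and global_slow() are stand-ins for illustration, not kernel functions, and
swp_entry_t is modeled as a plain integer here.]

#include <stdbool.h>
#include <stdio.h>

struct mem_cgroup { bool has_swap_priority; };
typedef unsigned long swp_entry_t;

static bool try_cgroup_priority(struct mem_cgroup *memcg, swp_entry_t *entry, int order)
{
	if (!memcg || !memcg->has_swap_priority)
		return false;        /* no per-cgroup list: use global path */
	*entry = 0x1001;             /* pretend a prioritized device had room */
	return true;
}

static bool global_fast(swp_entry_t *entry, int order) { return false; }
static bool global_slow(swp_entry_t *entry, int order) { *entry = 0x2001; return true; }

static swp_entry_t alloc_swap(struct mem_cgroup *memcg, int order)
{
	swp_entry_t entry = 0;

	/* Mirrors the shape of the folio_alloc_swap() change: cgroup path
	 * first, then the per-CPU fast path, then the plist-ordered slow path. */
	if (!try_cgroup_priority(memcg, &entry, order)) {
		if (!global_fast(&entry, order))
			global_slow(&entry, order);
	}
	return entry;
}

int main(void)
{
	struct mem_cgroup plain = { false }, tiered = { true };

	printf("plain cgroup  -> entry %#lx\n", alloc_swap(&plain, 0));
	printf("tiered cgroup -> entry %#lx\n", alloc_swap(&tiered, 0));
	return 0;
}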