From: Kairui Song <kasong@tencent.com>
Now the swap table is cluster based, which means a free cluster can free its
table, since no one should be modifying it.
There could be speculative readers, like swap cache lookups; protect
them by making the tables RCU safe. All swap tables should be filled with
null entries before being freed, so such readers will either see a NULL
pointer or a null-filled table being lazily freed.
On allocation, allocate the table when a cluster is first put to use by an
allocation of any order.
This way, we can reduce the memory usage of large swap devices
significantly.
The idea to dynamically release unused swap cluster data was initially
suggested by Chris Li while proposing the cluster swap allocator, and
I found it suits the swap table idea very well.
Co-developed-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/swap.h | 2 +-
mm/swap_state.c | 9 ++-
mm/swap_table.h | 32 +++++++-
mm/swapfile.c | 202 ++++++++++++++++++++++++++++++++++++++----------
4 files changed, 197 insertions(+), 48 deletions(-)
diff --git a/mm/swap.h b/mm/swap.h
index ce3ec62cc05e..ee33733027f4 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -36,7 +36,7 @@ struct swap_cluster_info {
u16 count;
u8 flags;
u8 order;
- atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */
+ atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */
struct list_head list;
};
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c0342024b4a8..a0120d822fbe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -87,7 +87,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry)
struct folio *folio;
for (;;) {
- swp_tb = __swap_table_get(swp_cluster(entry), swp_cluster_offset(entry));
+ swp_tb = swap_table_get(swp_cluster(entry),
+ swp_cluster_offset(entry));
if (!swp_tb_is_folio(swp_tb))
return NULL;
folio = swp_tb_to_folio(swp_tb);
@@ -107,10 +108,9 @@ void *swap_cache_get_shadow(swp_entry_t entry)
{
unsigned long swp_tb;
- swp_tb = __swap_table_get(swp_cluster(entry), swp_cluster_offset(entry));
+ swp_tb = swap_table_get(swp_cluster(entry), swp_cluster_offset(entry));
if (swp_tb_is_shadow(swp_tb))
return swp_tb_to_shadow(swp_tb);
-
return NULL;
}
@@ -135,6 +135,9 @@ int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, void **shadowp)
VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio);
ci = swap_cluster_lock(swp_info(entry), swp_offset(entry));
+ if (unlikely(!ci->table))
+ goto fail;
+
ci_start = swp_cluster_offset(entry);
ci_end = ci_start + nr_pages;
ci_off = ci_start;
diff --git a/mm/swap_table.h b/mm/swap_table.h
index ed9676547071..4e97513b11ef 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -2,8 +2,15 @@
#ifndef _MM_SWAP_TABLE_H
#define _MM_SWAP_TABLE_H
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
#include "swap.h"
+/* A typical flat array in each cluster as swap table */
+struct swap_table {
+ atomic_long_t entries[SWAPFILE_CLUSTER];
+};
+
/*
* A swap table entry represents the status of a swap slot on a swap
* (physical or virtual) device. The swap table in each cluster is a
@@ -76,15 +83,36 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb)
static inline void __swap_table_set(struct swap_cluster_info *ci,
unsigned int off, unsigned long swp_tb)
{
+ atomic_long_t *table = rcu_dereference_protected(ci->table, true);
+
+ lockdep_assert_held(&ci->lock);
VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
- atomic_long_set(&ci->table[off], swp_tb);
+ atomic_long_set(&table[off], swp_tb);
}
static inline unsigned long __swap_table_get(struct swap_cluster_info *ci,
unsigned int off)
{
+ atomic_long_t *table;
+
VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER);
- return atomic_long_read(&ci->table[off]);
+ table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock));
+
+ return atomic_long_read(&table[off]);
+}
+
+static inline unsigned long swap_table_get(struct swap_cluster_info *ci,
+ unsigned int off)
+{
+ atomic_long_t *table;
+ unsigned long swp_tb;
+
+ rcu_read_lock();
+ table = rcu_dereference(ci->table);
+ swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
+ rcu_read_unlock();
+
+ return swp_tb;
}
static inline void __swap_table_set_folio(struct swap_cluster_info *ci,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0c8001c99f30..00651e947eb2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -105,6 +105,8 @@ static DEFINE_SPINLOCK(swap_avail_lock);
struct swap_info_struct *swap_info[MAX_SWAPFILES];
+static struct kmem_cache *swap_table_cachep;
+
static DEFINE_MUTEX(swapon_mutex);
static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
@@ -402,10 +404,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info)
return info->flags == CLUSTER_FLAG_DISCARD;
}
+static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci)
+{
+ return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock));
+}
+
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
return false;
+ if (!cluster_table_is_alloced(ci))
+ return false;
if (!order)
return true;
return cluster_is_empty(ci) || order == ci->order;
@@ -423,32 +432,98 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si,
return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
-static int swap_table_alloc_table(struct swap_cluster_info *ci)
+static void swap_cluster_free_table(struct swap_cluster_info *ci)
{
- WARN_ON(ci->table);
- ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL);
- if (!ci->table)
- return -ENOMEM;
- return 0;
+ unsigned int ci_off;
+ struct swap_table *table;
+
+ /* Only an empty cluster's table is allowed to be freed */
+ lockdep_assert_held(&ci->lock);
+ VM_WARN_ON_ONCE(!cluster_is_empty(ci));
+ for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++)
+ VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off)));
+ table = (void *)rcu_dereference_protected(ci->table, true);
+ rcu_assign_pointer(ci->table, NULL);
+
+ kmem_cache_free(swap_table_cachep, table);
}
-static void swap_cluster_free_table(struct swap_cluster_info *ci)
+/*
+ * Allocating a swap table may need to sleep, which can lead to migration,
+ * so attempt an atomic allocation first, then fall back and handle the
+ * potential race.
+ */
+static struct swap_cluster_info *
+swap_cluster_alloc_table(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ int order)
{
- unsigned int ci_off;
- unsigned long swp_tb;
+ struct swap_cluster_info *pcp_ci;
+ struct swap_table *table;
+ unsigned long offset;
- if (!ci->table)
- return;
+ /*
+ * Only cluster isolation from the allocator does table allocation.
+ * Swap allocator uses a percpu cluster and holds the local lock.
+ */
+ lockdep_assert_held(&ci->lock);
+ lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
+
+ table = kmem_cache_zalloc(swap_table_cachep,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ if (table) {
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+ }
+
+ /*
+ * Try a sleeping allocation. Each isolated free cluster may cause
+ * a sleeping allocation, but there is a limited number of them, so
+ * the potential recursive allocation should be limited.
+ */
+ spin_unlock(&ci->lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_unlock(&si->global_cluster_lock);
+ local_unlock(&percpu_swap_cluster.lock);
+ table = kmem_cache_zalloc(swap_table_cachep, __GFP_HIGH | GFP_KERNEL);
- for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) {
- swp_tb = __swap_table_get(ci, ci_off);
- if (!swp_tb_is_null(swp_tb))
- pr_err_once("swap: unclean swap space on swapoff: 0x%lx",
- swp_tb);
+ local_lock(&percpu_swap_cluster.lock);
+ if (!(si->flags & SWP_SOLIDSTATE))
+ spin_lock(&si->global_cluster_lock);
+ /*
+ * Back to atomic context. First, check if we migrated to a new
+ * CPU with a usable percpu cluster. If so, try using that instead.
+ * No need to check it for spinning devices, as swap allocation is
+ * serialized by the global lock on them.
+ *
+ * The is_usable check is a bit rough, but ensures order 0 success.
+ */
+ offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+ if ((si->flags & SWP_SOLIDSTATE) && offset) {
+ pcp_ci = swap_cluster_lock(si, offset);
+ if (cluster_is_usable(pcp_ci, order) &&
+ pcp_ci->count < SWAPFILE_CLUSTER) {
+ ci = pcp_ci;
+ goto free_table;
+ }
+ swap_cluster_unlock(pcp_ci);
}
- kfree(ci->table);
- ci->table = NULL;
+ if (!table)
+ return NULL;
+
+ spin_lock(&ci->lock);
+ /* Nothing should have touched the dangling empty cluster. */
+ if (WARN_ON_ONCE(cluster_table_is_alloced(ci)))
+ goto free_table;
+
+ rcu_assign_pointer(ci->table, table);
+ return ci;
+
+free_table:
+ if (table)
+ kmem_cache_free(swap_table_cachep, table);
+ return ci;
}
static void move_cluster(struct swap_info_struct *si,
@@ -480,7 +555,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- lockdep_assert_held(&ci->lock);
+ swap_cluster_free_table(ci);
move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
ci->order = 0;
}
@@ -495,15 +570,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
* this returns NULL for an non-empty list.
*/
static struct swap_cluster_info *isolate_lock_cluster(
- struct swap_info_struct *si, struct list_head *list)
+ struct swap_info_struct *si, struct list_head *list, int order)
{
- struct swap_cluster_info *ci, *ret = NULL;
+ struct swap_cluster_info *ci, *found = NULL;
spin_lock(&si->lock);
-
- if (unlikely(!(si->flags & SWP_WRITEOK)))
- goto out;
-
list_for_each_entry(ci, list, list) {
if (!spin_trylock(&ci->lock))
continue;
@@ -515,13 +586,19 @@ static struct swap_cluster_info *isolate_lock_cluster(
list_del(&ci->list);
ci->flags = CLUSTER_FLAG_NONE;
- ret = ci;
+ found = ci;
break;
}
-out:
spin_unlock(&si->lock);
- return ret;
+ if (found && !cluster_table_is_alloced(found)) {
+ /* Only an empty free cluster's swap table can be freed. */
+ VM_WARN_ON_ONCE(list != &si->free_clusters);
+ VM_WARN_ON_ONCE(!cluster_is_empty(found));
+ return swap_cluster_alloc_table(si, found, order);
+ }
+
+ return found;
}
/*
@@ -654,17 +731,27 @@ static void relocate_cluster(struct swap_info_struct *si,
* added to free cluster list and its usage counter will be increased by 1.
* Only used for initialization.
*/
-static void inc_cluster_info_page(struct swap_info_struct *si,
+static int inc_cluster_info_page(struct swap_info_struct *si,
struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ struct swap_table *table;
struct swap_cluster_info *ci;
ci = cluster_info + idx;
+ if (!ci->table) {
+ table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ rcu_assign_pointer(ci->table, table);
+ }
+
ci->count++;
VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
VM_BUG_ON(ci->flags);
+
+ return 0;
}
static bool cluster_reclaim_range(struct swap_info_struct *si,
@@ -845,7 +932,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
unsigned int found = SWAP_ENTRY_INVALID;
do {
- struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
+ struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
unsigned long offset;
if (!ci)
@@ -870,7 +957,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
if (force)
to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
- while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
+ while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
offset = cluster_offset(si, ci);
end = min(si->max, offset + SWAPFILE_CLUSTER);
to_scan--;
@@ -1018,6 +1105,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
done:
if (!(si->flags & SWP_SOLIDSTATE))
spin_unlock(&si->global_cluster_lock);
+
return found;
}
@@ -1885,7 +1973,13 @@ swp_entry_t get_swap_page_of_type(int type)
/* This is called for allocating swap entry, not cache */
if (get_swap_device_info(si)) {
if (si->flags & SWP_WRITEOK) {
+ /*
+ * Grab the local lock to be compliant
+ * with swap table allocation.
+ */
+ local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
+ local_unlock(&percpu_swap_cluster.lock);
if (offset) {
entry = swp_entry(si->type, offset);
atomic_long_dec(&nr_swap_pages);
@@ -2678,12 +2772,21 @@ static void wait_for_allocation(struct swap_info_struct *si)
static void free_cluster_info(struct swap_cluster_info *cluster_info,
unsigned long maxpages)
{
+ struct swap_cluster_info *ci;
int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
if (!cluster_info)
return;
- for (i = 0; i < nr_clusters; i++)
- swap_cluster_free_table(&cluster_info[i]);
+ for (i = 0; i < nr_clusters; i++) {
+ ci = cluster_info + i;
+ /* Clusters counting bad page marks will still have a remaining table */
+ spin_lock(&ci->lock);
+ if (rcu_dereference_protected(ci->table, true)) {
+ ci->count = 0;
+ swap_cluster_free_table(ci);
+ }
+ spin_unlock(&ci->lock);
+ }
kvfree(cluster_info);
}
@@ -2719,6 +2822,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct address_space *mapping;
struct inode *inode;
struct filename *pathname;
+ unsigned int maxpages;
int err, found = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -2825,8 +2929,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->swap_map = NULL;
zeromap = p->zeromap;
p->zeromap = NULL;
+ maxpages = p->max;
cluster_info = p->cluster_info;
- free_cluster_info(cluster_info, p->max);
p->max = 0;
p->cluster_info = NULL;
spin_unlock(&p->lock);
@@ -2838,6 +2942,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->global_cluster = NULL;
vfree(swap_map);
kvfree(zeromap);
+ free_cluster_info(cluster_info, maxpages);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
@@ -3216,11 +3321,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
if (!cluster_info)
goto err;
- for (i = 0; i < nr_clusters; i++) {
+ for (i = 0; i < nr_clusters; i++)
spin_lock_init(&cluster_info[i].lock);
- if (swap_table_alloc_table(&cluster_info[i]))
- goto err_free;
- }
if (!(si->flags & SWP_SOLIDSTATE)) {
si->global_cluster = kmalloc(sizeof(*si->global_cluster),
@@ -3239,16 +3341,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
* See setup_swap_map(): header page, bad pages,
* and the EOF part of the last cluster.
*/
- inc_cluster_info_page(si, cluster_info, 0);
+ err = inc_cluster_info_page(si, cluster_info, 0);
+ if (err)
+ goto err;
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr >= maxpages)
continue;
- inc_cluster_info_page(si, cluster_info, page_nr);
+ err = inc_cluster_info_page(si, cluster_info, page_nr);
+ if (err)
+ goto err;
+ }
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
+ err = inc_cluster_info_page(si, cluster_info, i);
+ if (err)
+ goto err;
}
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(si, cluster_info, i);
INIT_LIST_HEAD(&si->free_clusters);
INIT_LIST_HEAD(&si->full_clusters);
@@ -3962,6 +4071,15 @@ static int __init swapfile_init(void)
swapfile_maximum_size = arch_max_swapfile_size();
+ /*
+ * Once a cluster is freed, its swap table content is read
+ * only, and all swap cache readers (swap_cache_*) verify
+ * the content before use. So it's safe to use an RCU slab here.
+ */
+ swap_table_cachep = kmem_cache_create("swap_table",
+ sizeof(struct swap_table),
+ 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+
#ifdef CONFIG_MIGRATION
if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
swap_migration_ad_supported = true;
--
2.51.0
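
For readers following the RCU reasoning in the commit message, here is a
condensed, illustrative summary of the lifetime rule the code above relies on
(a simplified sketch, not authoritative; the real logic is in the
mm/swap_table.h and mm/swapfile.c hunks of this patch):

/*
 * Free side (cluster lock held, every slot already set to null):
 *
 *	rcu_assign_pointer(ci->table, NULL);
 *	kmem_cache_free(swap_table_cachep, table);
 *
 * Speculative reader (e.g. a swap cache lookup):
 *
 *	rcu_read_lock();
 *	table = rcu_dereference(ci->table);              (may be NULL)
 *	swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb();
 *	rcu_read_unlock();
 *
 * Because the cache is SLAB_TYPESAFE_BY_RCU, the memory a racing reader
 * dereferences is always still some swap_table (possibly one already
 * reused by another cluster), never freed or retyped memory. So a reader
 * can only observe a NULL pointer, a null entry, or a stale entry, and
 * the swap_cache_* helpers re-validate whatever they find before
 * trusting it.
 */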
On Sat, Aug 23, 2025 at 3:21 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Now swap table is cluster based, which means free clusters can free its
> table since no one should modify it.
>
> There could be speculative readers, like swap cache look up, protect
> them by making them RCU safe. All swap table should be filled with null
> entries before free, so such readers will either see a NULL pointer or
> a null filled table being lazy freed.
>
> On allocation, allocate the table when a cluster is used by any order.
>

Might be a silly question.

Just curious—what happens if the allocation fails? Does the swap-out
operation also fail? We sometimes encounter strange issues when memory is
very limited, especially if the reclamation path itself needs to allocate
memory.

Assume a case where we want to swap out a folio using clusterN. We then
attempt to swap out the following folios with the same clusterN. But if
the allocation of the swap_table keeps failing, what will happen?

> This way, we can reduce the memory usage of large swap device
> significantly.
>
> This idea to dynamically release unused swap cluster data was initially
> suggested by Chris Li while proposing the cluster swap allocator and
> I found it suits the swap table idea very well.
>

Thanks
Barry
On Tue, Sep 2, 2025 at 4:15 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Sat, Aug 23, 2025 at 3:21 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > [...]
> >
> > On allocation, allocate the table when a cluster is used by any order.
>
> Might be a silly question.
>
> Just curious—what happens if the allocation fails? Does the swap-out
> operation also fail? We sometimes encounter strange issues when memory is
> very limited, especially if the reclamation path itself needs to allocate
> memory.
>
> Assume a case where we want to swap out a folio using clusterN. We then
> attempt to swap out the following folios with the same clusterN. But if
> the allocation of the swap_table keeps failing, what will happen?

I think this is the same behavior as the XArray allocation node with no memory.
The swap allocator will fail to isolate this cluster, it gets a NULL
ci pointer as return value. The swap allocator will try other cluster
lists, e.g. non_full, fragment etc.

If all of them fail, the folio_alloc_swap() will return -ENOMEM. Which
will propagate back to the try to swap out, then the shrink folio
list. It will put this page back to the LRU.

The shrink folio list either free enough memory (happy path) or not
able to free enough memory and it will cause an OOM kill.

I believe previously XArray will also return -ENOMEM at insert a
pointer and not be able to allocate a node to hold that pointer. It has
the same error propagation path. We did not change that.

Chris
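
To make the fallback order described above concrete, a minimal standalone
sketch of the control flow (all names here are hypothetical stand-ins for
illustration, not the kernel's actual functions):

#include <errno.h>
#include <stdio.h>

/* Cluster lists tried in order, mirroring the fallback described above. */
enum list_id { LIST_FREE, LIST_NONFULL, LIST_FRAG, LIST_FULL_RECLAIM, NR_LISTS };

/* Stand-in for isolating a cluster from one list and allocating from it.
 * Returns a nonzero offset on success, 0 if the list is empty or the
 * cluster's swap table could not be allocated. */
static unsigned long try_isolate_and_alloc(enum list_id list, int order)
{
	(void)list;
	(void)order;
	return 0;	/* simulate every attempt failing */
}

/* Returns a swap offset, or -ENOMEM once every list has been tried. */
static long alloc_swap_slot(int order)
{
	for (int list = LIST_FREE; list < NR_LISTS; list++) {
		unsigned long off = try_isolate_and_alloc(list, order);
		if (off)
			return (long)off;
	}
	/* Propagates back to reclaim, which puts the folio back on the
	 * LRU; repeated failure can eventually lead to an OOM kill. */
	return -ENOMEM;
}

int main(void)
{
	printf("result: %ld\n", alloc_swap_slot(0));
	return 0;
}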
On Wed, Sep 3, 2025 at 1:17 AM Chris Li <chrisl@kernel.org> wrote:
>
> On Tue, Sep 2, 2025 at 4:15 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > [...]
> >
> > Assume a case where we want to swap out a folio using clusterN. We then
> > attempt to swap out the following folios with the same clusterN. But if
> > the allocation of the swap_table keeps failing, what will happen?
>
> I think this is the same behavior as the XArray allocation node with no memory.
> The swap allocator will fail to isolate this cluster, it gets a NULL
> ci pointer as return value. The swap allocator will try other cluster
> lists, e.g. non_full, fragment etc.

What I’m actually concerned about is that we keep iterating on this
cluster. If we try others, that sounds good.

> If all of them fail, the folio_alloc_swap() will return -ENOMEM. Which
> will propagate back to the try to swap out, then the shrink folio
> list. It will put this page back to the LRU.
>
> The shrink folio list either free enough memory (happy path) or not
> able to free enough memory and it will cause an OOM kill.
>
> I believe previously XArray will also return -ENOMEM at insert a
> pointer and not be able to allocate a node to hold that pointer. It has
> the same error propagation path. We did not change that.

Yes, I agree there was an -ENOMEM, but the difference is that we
are allocating much larger now :-)

One option is to organize every 4 or 8 swap slots into a group for
allocating or freeing the swap table. This way, we avoid the worst
case where a single unfreed slot consumes a whole swap table, and
the allocation size also becomes smaller. However, it’s unclear
whether the memory savings justify the added complexity and effort.

Anyway, I’m glad to see the current swap_table moving towards merge
and look forward to running it on various devices. This should help
us see if it causes any real issues.

Thanks
Barry
On Tue, Sep 2, 2025 at 4:31 PM Barry Song <21cnbao@gmail.com> wrote:
>
> On Wed, Sep 3, 2025 at 1:17 AM Chris Li <chrisl@kernel.org> wrote:
> >
> > [...]
> >
> > I think this is the same behavior as the XArray allocation node with no memory.
> > The swap allocator will fail to isolate this cluster, it gets a NULL
> > ci pointer as return value. The swap allocator will try other cluster
> > lists, e.g. non_full, fragment etc.
>
> What I’m actually concerned about is that we keep iterating on this
> cluster. If we try others, that sounds good.

No, the isolation of the current cluster will remove the cluster from
the head and eventually put it back to the tail of the appropriate
list. It will not keep iterating the same cluster. Otherwise trying to
allocate a high order swap entry will also deadloop on the first
cluster if it fails to allocate swap entries.

> > If all of them fail, the folio_alloc_swap() will return -ENOMEM. Which
> > will propagate back to the try to swap out, then the shrink folio
> > list. It will put this page back to the LRU.
> >
> > The shrink folio list either free enough memory (happy path) or not
> > able to free enough memory and it will cause an OOM kill.
> >
> > I believe previously XArray will also return -ENOMEM at insert a
> > pointer and not be able to allocate a node to hold that pointer. It has
> > the same error propagation path. We did not change that.
>
> Yes, I agree there was an -ENOMEM, but the difference is that we
> are allocating much larger now :-)

Even that is not 100% true. The XArray uses kmem_cache. Most of the
time it is allocated from the kmem_cache cached page without hitting
the system page allocation. When kmem_cache runs out of the current
cached page, it will allocate from the system via page allocation, at
least page size.

So from the page allocator point of view, the swap table allocation is
not bigger either.

> One option is to organize every 4 or 8 swap slots into a group for
> allocating or freeing the swap table. This way, we avoid the worst
> case where a single unfreed slot consumes a whole swap table, and
> the allocation size also becomes smaller. However, it’s unclear
> whether the memory savings justify the added complexity and effort.

Keep in mind that XArray also has this fragmentation issue as well.
When a 64 pointer node is free, it will return to the kmem_cache as a
free area of the cache page. Only when every object in that page is
free can that page return to the page allocator. The difference is
that the unused area sitting in the swap table can be used
immediately. The unused XArray node will sit in the kmem_cache and
need an extra kmem_cache_alloc to get the node to be used in the XArray.

There is also a subtle difference in that all xarrays share the same
kmem_cache pool for all xarray users. There is no dedicated kmem_cache
pool for swap. The swap node might mix with other xarray nodes, making
it even harder to release the underlying page. The swap table uses the
page directly and does not have this issue. If you have a swing of
batch jobs causing a lot of swap, when the job is done, those swap
entries will be freed and the swap table can return those pages back.
But xarray might not be able to release as many pages because of the
mixed usage of the xarray. It depends on what other xarray nodes were
allocated during the swap usage.

I guess that is too much detail.

> Anyway, I’m glad to see the current swap_table moving towards merge
> and look forward to running it on various devices.

Agree.

Chris
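
A small illustration of the point about the swap table using the page
directly: assuming a 512-entry cluster, 8-byte entries and 4 KiB pages (the
struct name below is made up for the example), one cluster's table is exactly
one page, so an empty cluster hands a whole page straight back to the buddy
allocator:

#include <assert.h>

/* Stand-in for the patch's struct swap_table on a 64-bit build where
 * SWAPFILE_CLUSTER is 512. */
struct swap_table_sketch {
	long entries[512];
};

/* One cluster's table is exactly one 4 KiB page, so freeing an empty
 * cluster's table returns a whole page at once, with no slab-level
 * sharing involved. */
static_assert(sizeof(struct swap_table_sketch) == 4096,
	      "one cluster's swap table == one 4 KiB page");

int main(void)
{
	return 0;
}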
On Wed, Sep 3, 2025 at 8:35 PM Chris Li <chrisl@kernel.org> wrote:
>
> On Tue, Sep 2, 2025 at 4:31 PM Barry Song <21cnbao@gmail.com> wrote:
> >
> > [...]
> >
> > What I’m actually concerned about is that we keep iterating on this
> > cluster. If we try others, that sounds good.
>
> No, the isolation of the current cluster will remove the cluster from
> the head and eventually put it back to the tail of the appropriate
> list. It will not keep iterating the same cluster. Otherwise trying to
> allocate a high order swap entry will also deadloop on the first
> cluster if it fails to allocate swap entries.
>
> > Yes, I agree there was an -ENOMEM, but the difference is that we
> > are allocating much larger now :-)
>
> Even that is not 100% true. The XArray uses kmem_cache. Most of the
> time it is allocated from the kmem_cache cached page without hitting
> the system page allocation. When kmem_cache runs out of the current
> cached page, it will allocate from the system via page allocation, at
> least page size.

Exactly—that’s what I mean. When we hit the cache, allocation is far more
predictable than when it comes from the buddy allocator.

> So from the page allocator point of view, the swap table allocation is
> not bigger either.

I think the fundamental difference lies in how much pressure we place
on the buddy allocator.

> > One option is to organize every 4 or 8 swap slots into a group for
> > allocating or freeing the swap table. This way, we avoid the worst
> > case where a single unfreed slot consumes a whole swap table, and
> > the allocation size also becomes smaller. However, it’s unclear
> > whether the memory savings justify the added complexity and effort.
>
> Keep in mind that XArray also has this fragmentation issue as well.
> When a 64 pointer node is free, it will return to the kmem_cache as a
> free area of the cache page. Only when every object in that page is
> free can that page return to the page allocator.
>
> [...]

Yes. If we organize the swap_table in group sizes of 16, 32, 64, 128, and so
on, we might gain the same benefit: those small objects become immediately
available to other allocations—no matter if they are visible to the buddy
allocator.

Anyway, I don’t have data to show whether the added complexity is worth
trying. I’m just glad the current approach is hoped to land and run on real
phones.

> I guess that is too much detail.
>
> > Anyway, I’m glad to see the current swap_table moving towards merge
> > and look forward to running it on various devices.

Thanks
Barry
On Wed, Sep 3, 2025 at 1:52 PM Barry Song <21cnbao@gmail.com> wrote:
>
> [...]
>
> Exactly—that’s what I mean. When we hit the cache, allocation is far more
> predictable than when it comes from the buddy allocator.

That statement is true if the number of allocations is the same.
However, because the xarray node size is 64, xarray needs to be
allocated a lot more often than swap tables, which are page sized. From
the page allocator point of view, these two should be similar.
Basically every 512 swap entries allocate one page from the page
allocator.

> I think the fundamental difference lies in how much pressure we place
> on the buddy allocator.

Should be about the same. About every 512 swap entries allocate a page.
That does not consider that xarray has internal nodes as well.

Can you help me understand why you think xarray has less allocation
pressure?

> Yes. If we organize the swap_table in group sizes of 16, 32, 64, 128, and so
> on, we might gain the same benefit: those small objects become immediately
> available to other allocations—no matter if they are visible to the buddy
> allocator.

The swap table is page sized. One cluster still has 512 entries. If you
make the swap_table smaller, then you need more swap_tables for one
cluster. The swap table for one cluster needs to add up to 512 entries
anyway. A smaller swap table size does not make sense to me.

Chris
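
Rough numbers behind the comparison above (illustrative only, assuming
512-entry clusters, 8-byte table entries and 64-slot XArray leaf nodes):

  swap table: 512 entries x 8 bytes = 4096 bytes = one 4 KiB page per cluster
  XArray:     512 entries / 64 slots per node = 8 leaf nodes per cluster,
              i.e. at least 8 x 512 bytes of slot storage plus per-node
              metadata, carved out of shared kmem_cache slab pages

Either way the page allocator ends up supplying on the order of one page per
512 swap entries; the difference is mainly in how that memory is carved up and
how quickly it can be returned.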
On Wed, Sep 3, 2025 at 08:03, Barry Song <21cnbao@gmail.com> wrote:
>
> [...]
>
> One option is to organize every 4 or 8 swap slots into a group for
> allocating or freeing the swap table. This way, we avoid the worst
> case where a single unfreed slot consumes a whole swap table, and
> the allocation size also becomes smaller. However, it’s unclear
> whether the memory savings justify the added complexity and effort.
>
> Anyway, I’m glad to see the current swap_table moving towards merge
> and look forward to running it on various devices. This should help
> us see if it causes any real issues.

Thanks for the insightful review.

I do plan to implement a shrinker to compact the swap table of idle /
full clusters when under pressure. It will be done at the very end.
Things will be much cleaner by then so it's easier to do. And currently
it seems the memory usage is quite good already.

> Thanks
> Barry
On Tue, Sep 2, 2025 at 9:20 PM Chris Li <chrisl@kernel.org> wrote:
>
> [...]
>
> I think this is the same behavior as the XArray allocation node with no memory.
> The swap allocator will fail to isolate this cluster, it gets a NULL
> ci pointer as return value. The swap allocator will try other cluster
> lists, e.g. non_full, fragment etc.
>
> If all of them fail, the folio_alloc_swap() will return -ENOMEM. Which
> will propagate back to the try to swap out, then the shrink folio
> list. It will put this page back to the LRU.
>
> The shrink folio list either free enough memory (happy path) or not
> able to free enough memory and it will cause an OOM kill.
>
> I believe previously XArray will also return -ENOMEM at insert a
> pointer and not be able to allocate a node to hold that pointer. It has
> the same error propagation path. We did not change that.

Yes, exactly. The overall behaviour is the same.

The allocation is only needed when a CPU's local swap cluster is
drained and the swap allocator needs a new cluster. But after the
previous patch [1], many swap devices will prefer the nonfull list. So
the chance that we need a swap table allocation is lower.

If it fails to allocate a swap table for a new cluster, it will try to
fall back to frag / reclaim full. Only if all lists are drained may
folio_alloc_swap fail with -ENOMEM, and the caller (lru shrink) either
tries to reclaim some other page or fails with OOM.

I think the fallback of nonfull / free / frag / reclaim-full might even
be helpful to avoid swapout failure when under heavy pressure. I don't
have data for that though, but I did run many tests with heavy pressure
and didn't see any issue.

Link: https://lore.kernel.org/linux-mm/20250812-swap-scan-list-v3-0-6d73504d267b@kernel.org/ [1]

> Chris
Acked-by: Chris Li <chrisl@kernel.org>

Chris

PS, this version already has my feedback incorporated.

On Fri, Aug 22, 2025 at 12:21 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> [...]