[PATCH v2 15/15] mm, swap: use a single page for swap table when the size fits

Posted by Kairui Song 4 days, 10 hours ago
From: Kairui Song <kasong@tencent.com>

We have a cluster size of 512 slots. Each slot consumes 8 bytes in the swap
table, so the swap table of each cluster is exactly one page (4K).
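For reference, a minimal standalone sketch of that size math (an
illustration only: it assumes 4K pages and a 64-bit build where
atomic_long_t is 8 bytes, and uses plain long as a stand-in; it is not
kernel code):

  #define SWAPFILE_CLUSTER 512		/* slots per cluster */
  #define ASSUMED_PAGE_SIZE 4096	/* assumed 4K page size */

  struct swap_table {
  	long entries[SWAPFILE_CLUSTER];	/* 8 bytes each on 64-bit */
  };

  int main(void)
  {
  	/* 512 slots * 8 bytes = 4096 bytes, exactly one 4K page. */
  	_Static_assert(sizeof(struct swap_table) == ASSUMED_PAGE_SIZE,
  		       "one cluster's swap table fits a single page");
  	return 0;
  }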

If that condition is true, allocate one page directly and disable the slab
cache, to reduce the memory usage of the swap table and avoid fragmentation.
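Note the page cannot be freed immediately: lockless readers may still hold
a pointer loaded from ci->table, so swap_table_free() below defers the
folio_put() with call_rcu() until a grace period has passed. A hedged
sketch of the reader side this protects (the rcu_read_lock() /
rcu_dereference() pattern is the standard RCU one; the exact reader code
is not part of this patch):

  rcu_read_lock();
  table = rcu_dereference(ci->table);
  if (table)
  	/* backing page stays valid here thanks to call_rcu() */
  	val = atomic_long_read(&table->entries[ci_off]);
  rcu_read_unlock();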

Co-developed-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Chris Li <chrisl@kernel.org>
Signed-off-by: Kairui Song <kasong@tencent.com>
Acked-by: Chris Li <chrisl@kernel.org>
---
 mm/swap_table.h |  2 ++
 mm/swapfile.c   | 50 ++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/mm/swap_table.h b/mm/swap_table.h
index 52254e455304..ea244a57a5b7 100644
--- a/mm/swap_table.h
+++ b/mm/swap_table.h
@@ -11,6 +11,8 @@ struct swap_table {
 	atomic_long_t entries[SWAPFILE_CLUSTER];
 };
 
+#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE)
+
 /*
  * A swap table entry represents the status of a swap slot on a swap
  * (physical or virtual) device. The swap table in each cluster is a
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 49f93069faef..ab6e877b0644 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -430,6 +430,38 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si,
 	return cluster_index(si, ci) * SWAPFILE_CLUSTER;
 }
 
+static struct swap_table *swap_table_alloc(gfp_t gfp)
+{
+	struct folio *folio;
+
+	if (!SWP_TABLE_USE_PAGE)
+		return kmem_cache_zalloc(swap_table_cachep, gfp);
+
+	folio = folio_alloc(gfp | __GFP_ZERO, 0);
+	if (folio)
+		return folio_address(folio);
+	return NULL;
+}
+
+static void swap_table_free_folio_rcu_cb(struct rcu_head *head)
+{
+	struct folio *folio;
+
+	folio = page_folio(container_of(head, struct page, rcu_head));
+	folio_put(folio);
+}
+
+static void swap_table_free(struct swap_table *table)
+{
+	if (!SWP_TABLE_USE_PAGE) {
+		kmem_cache_free(swap_table_cachep, table);
+		return;
+	}
+
+	call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head),
+		 swap_table_free_folio_rcu_cb);
+}
+
 static void swap_cluster_free_table(struct swap_cluster_info *ci)
 {
 	unsigned int ci_off;
@@ -443,7 +475,7 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci)
 	table = (void *)rcu_dereference_protected(ci->table, true);
 	rcu_assign_pointer(ci->table, NULL);
 
-	kmem_cache_free(swap_table_cachep, table);
+	swap_table_free(table);
 }
 
 /*
@@ -467,8 +499,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
 	lockdep_assert_held(&ci->lock);
 	lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock);
 
-	table = kmem_cache_zalloc(swap_table_cachep,
-				  __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
+	table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
 	if (table) {
 		rcu_assign_pointer(ci->table, table);
 		return ci;
@@ -483,7 +514,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
 	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_unlock(&si->global_cluster_lock);
 	local_unlock(&percpu_swap_cluster.lock);
-	table = kmem_cache_zalloc(swap_table_cachep, __GFP_HIGH | GFP_KERNEL);
+	table = swap_table_alloc(__GFP_HIGH | GFP_KERNEL);
 
 	local_lock(&percpu_swap_cluster.lock);
 	if (!(si->flags & SWP_SOLIDSTATE))
@@ -520,7 +551,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si,
 
 free_table:
 	if (table)
-		kmem_cache_free(swap_table_cachep, table);
+		swap_table_free(table);
 	return ci;
 }
 
@@ -738,7 +769,7 @@ static int inc_cluster_info_page(struct swap_info_struct *si,
 
 	ci = cluster_info + idx;
 	if (!ci->table) {
-		table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL);
+		table = swap_table_alloc(GFP_KERNEL);
 		if (!table)
 			return -ENOMEM;
 		rcu_assign_pointer(ci->table, table);
@@ -4075,9 +4106,10 @@ static int __init swapfile_init(void)
 	 * only, and all swap cache readers (swap_cache_*) verifies
 	 * the content before use. So it's safe to use RCU slab here.
 	 */
-	swap_table_cachep = kmem_cache_create("swap_table",
-			    sizeof(struct swap_table),
-			    0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
+	if (!SWP_TABLE_USE_PAGE)
+		swap_table_cachep = kmem_cache_create("swap_table",
+				    sizeof(struct swap_table),
+				    0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL);
 
 #ifdef CONFIG_MIGRATION
 	if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
-- 
2.51.0
Re: [PATCH v2 15/15] mm, swap: use a single page for swap table when the size fits
Posted by Chris Li 3 days, 13 hours ago
I did not notice any new changes, anyway.

Acked-by: Chris Li <chrisl@kernel.org>

Chris

On Fri, Sep 5, 2025 at 12:15 PM Kairui Song <ryncsn@gmail.com> wrote:
> [...]