[PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 1 month, 1 week ago
From: Kairui Song <kasong@tencent.com>

Always use swap_cache_get_folio for swap cache folio lookup. The reason
it is not yet used everywhere is that it also updates the readahead
info, and some callsites want to avoid that.

So decouple the readahead update from the swap cache lookup into a
standalone helper, and let callers invoke the readahead helper when
needed. Then convert all swap cache lookups to use swap_cache_get_folio.

After this commit, there are only three special cases that access the
swap cache space directly: huge memory splitting, migration, and shmem
replacing, because they need to lock the XArray. Following commits will
wrap their swap cache accesses with dedicated helpers too.
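
In short, a caller that wants the readahead statistics updated now
does, in sketch form (see the do_swap_page() hunk below):

    folio = swap_cache_get_folio(entry);
    if (folio)
        swap_update_readahead(folio, vma, addr);

Callers that want to avoid touching the readahead info simply skip the
second call.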

Signed-off-by: Kairui Song <kasong@tencent.com>
---
 mm/memory.c      |  6 ++-
 mm/mincore.c     |  3 +-
 mm/shmem.c       |  4 +-
 mm/swap.h        | 13 +++++--
 mm/swap_state.c  | 99 +++++++++++++++++++++++-------------------------
 mm/swapfile.c    | 11 +++---
 mm/userfaultfd.c |  5 +--
 7 files changed, 72 insertions(+), 69 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index d9de6c056179..10ef528a5f44 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4660,9 +4660,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (unlikely(!si))
 		goto out;
 
-	folio = swap_cache_get_folio(entry, vma, vmf->address);
-	if (folio)
+	folio = swap_cache_get_folio(entry);
+	if (folio) {
+		swap_update_readahead(folio, vma, vmf->address);
 		page = folio_file_page(folio, swp_offset(entry));
+	}
 	swapcache = folio;
 
 	if (!folio) {
diff --git a/mm/mincore.c b/mm/mincore.c
index 2f3e1816a30d..8ec4719370e1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -76,8 +76,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem)
 		if (!si)
 			return 0;
 	}
-	folio = filemap_get_entry(swap_address_space(entry),
-				  swap_cache_index(entry));
+	folio = swap_cache_get_folio(entry);
 	if (shmem)
 		put_swap_device(si);
 	/* The swap cache space contains either folio, shadow or NULL */
diff --git a/mm/shmem.c b/mm/shmem.c
index 13cc51df3893..e9d0d2784cd5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2354,7 +2354,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	}
 
 	/* Look it up and read it in.. */
-	folio = swap_cache_get_folio(swap, NULL, 0);
+	folio = swap_cache_get_folio(swap);
 	if (!folio) {
 		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
 			/* Direct swapin skipping swap cache & readahead */
@@ -2379,6 +2379,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			count_vm_event(PGMAJFAULT);
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
+	} else {
+		swap_update_readahead(folio, NULL, 0);
 	}
 
 	if (order > folio_order(folio)) {
diff --git a/mm/swap.h b/mm/swap.h
index 1ae44d4193b1..efb6d7ff9f30 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -62,8 +62,7 @@ void delete_from_swap_cache(struct folio *folio);
 void clear_shadow_from_swap_cache(int type, unsigned long begin,
 				  unsigned long end);
 void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
-struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr);
+struct folio *swap_cache_get_folio(swp_entry_t entry);
 struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		struct vm_area_struct *vma, unsigned long addr,
 		struct swap_iocb **plug);
@@ -74,6 +73,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 		struct mempolicy *mpol, pgoff_t ilx);
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
 		struct vm_fault *vmf);
+void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
+			   unsigned long addr);
 
 static inline unsigned int folio_swap_flags(struct folio *folio)
 {
@@ -159,6 +160,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
+static inline void swap_update_readahead(struct folio *folio,
+		struct vm_area_struct *vma, unsigned long addr)
+{
+}
+
 static inline int swap_writeout(struct folio *folio,
 		struct swap_iocb **swap_plug)
 {
@@ -169,8 +175,7 @@ static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entr
 {
 }
 
-static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr)
+static inline struct folio *swap_cache_get_folio(swp_entry_t entry)
 {
 	return NULL;
 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 99513b74b5d8..ff9eb761a103 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -69,6 +69,21 @@ void show_swap_cache_info(void)
 	printk("Total swap = %lukB\n", K(total_swap_pages));
 }
 
+/*
+ * Lookup a swap entry in the swap cache. A found folio will be returned
+ * unlocked and with its refcount incremented.
+ *
+ * Caller must lock the swap device or hold a reference to keep it valid.
+ */
+struct folio *swap_cache_get_folio(swp_entry_t entry)
+{
+	struct folio *folio = filemap_get_folio(swap_address_space(entry),
+						swap_cache_index(entry));
+	if (!IS_ERR(folio))
+		return folio;
+	return NULL;
+}
+
 void *get_shadow_from_swap_cache(swp_entry_t entry)
 {
 	struct address_space *address_space = swap_address_space(entry);
@@ -273,54 +288,40 @@ static inline bool swap_use_vma_readahead(void)
 }
 
 /*
- * Lookup a swap entry in the swap cache. A found folio will be returned
- * unlocked and with its refcount incremented - we rely on the kernel
- * lock getting page table operations atomic even if we drop the folio
- * lock before returning.
- *
- * Caller must lock the swap device or hold a reference to keep it valid.
+ * Update the readahead statistics of a vma or globally.
  */
-struct folio *swap_cache_get_folio(swp_entry_t entry,
-		struct vm_area_struct *vma, unsigned long addr)
+void swap_update_readahead(struct folio *folio,
+			   struct vm_area_struct *vma,
+			   unsigned long addr)
 {
-	struct folio *folio;
-
-	folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
-	if (!IS_ERR(folio)) {
-		bool vma_ra = swap_use_vma_readahead();
-		bool readahead;
+	bool readahead, vma_ra = swap_use_vma_readahead();
 
-		/*
-		 * At the moment, we don't support PG_readahead for anon THP
-		 * so let's bail out rather than confusing the readahead stat.
-		 */
-		if (unlikely(folio_test_large(folio)))
-			return folio;
-
-		readahead = folio_test_clear_readahead(folio);
-		if (vma && vma_ra) {
-			unsigned long ra_val;
-			int win, hits;
-
-			ra_val = GET_SWAP_RA_VAL(vma);
-			win = SWAP_RA_WIN(ra_val);
-			hits = SWAP_RA_HITS(ra_val);
-			if (readahead)
-				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
-			atomic_long_set(&vma->swap_readahead_info,
-					SWAP_RA_VAL(addr, win, hits));
-		}
-
-		if (readahead) {
-			count_vm_event(SWAP_RA_HIT);
-			if (!vma || !vma_ra)
-				atomic_inc(&swapin_readahead_hits);
-		}
-	} else {
-		folio = NULL;
+	/*
+	 * At the moment, we don't support PG_readahead for anon THP
+	 * so let's bail out rather than confusing the readahead stat.
+	 */
+	if (unlikely(folio_test_large(folio)))
+		return;
+
+	readahead = folio_test_clear_readahead(folio);
+	if (vma && vma_ra) {
+		unsigned long ra_val;
+		int win, hits;
+
+		ra_val = GET_SWAP_RA_VAL(vma);
+		win = SWAP_RA_WIN(ra_val);
+		hits = SWAP_RA_HITS(ra_val);
+		if (readahead)
+			hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
+		atomic_long_set(&vma->swap_readahead_info,
+				SWAP_RA_VAL(addr, win, hits));
 	}
 
-	return folio;
+	if (readahead) {
+		count_vm_event(SWAP_RA_HIT);
+		if (!vma || !vma_ra)
+			atomic_inc(&swapin_readahead_hits);
+	}
 }
 
 struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
@@ -336,14 +337,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	*new_page_allocated = false;
 	for (;;) {
 		int err;
-		/*
-		 * First check the swap cache.  Since this is normally
-		 * called after swap_cache_get_folio() failed, re-calling
-		 * that would confuse statistics.
-		 */
-		folio = filemap_get_folio(swap_address_space(entry),
-					  swap_cache_index(entry));
-		if (!IS_ERR(folio))
+
+		/* Check the swap cache in case the folio is already there */
+		folio = swap_cache_get_folio(entry);
+		if (folio)
 			goto got_folio;
 
 		/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a7ffabbe65ef..4b8ab2cb49ca 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -213,15 +213,14 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 				 unsigned long offset, unsigned long flags)
 {
 	swp_entry_t entry = swp_entry(si->type, offset);
-	struct address_space *address_space = swap_address_space(entry);
 	struct swap_cluster_info *ci;
 	struct folio *folio;
 	int ret, nr_pages;
 	bool need_reclaim;
 
 again:
-	folio = filemap_get_folio(address_space, swap_cache_index(entry));
-	if (IS_ERR(folio))
+	folio = swap_cache_get_folio(entry);
+	if (!folio)
 		return 0;
 
 	nr_pages = folio_nr_pages(folio);
@@ -2131,7 +2130,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		pte_unmap(pte);
 		pte = NULL;
 
-		folio = swap_cache_get_folio(entry, vma, addr);
+		folio = swap_cache_get_folio(entry);
 		if (!folio) {
 			struct vm_fault vmf = {
 				.vma = vma,
@@ -2357,8 +2356,8 @@ static int try_to_unuse(unsigned int type)
 	       (i = find_next_to_unuse(si, i)) != 0) {
 
 		entry = swp_entry(type, i);
-		folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
-		if (IS_ERR(folio))
+		folio = swap_cache_get_folio(entry);
+		if (!folio)
 			continue;
 
 		/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 50aaa8dcd24c..af61b95c89e4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1489,9 +1489,8 @@ static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd
 		 * separately to allow proper handling.
 		 */
 		if (!src_folio)
-			folio = filemap_get_folio(swap_address_space(entry),
-					swap_cache_index(entry));
-		if (!IS_ERR_OR_NULL(folio)) {
+			folio = swap_cache_get_folio(entry);
+		if (folio) {
 			if (folio_test_large(folio)) {
 				ret = -EBUSY;
 				folio_put(folio);
-- 
2.51.0
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Nhat Pham 1 month ago
On Fri, Aug 22, 2025 at 12:20 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Always use swap_cache_get_folio for swap cache folio look up. The reason
> we are not using it in all places is that it also updates the readahead
> info, and some callsites want to avoid that.
>
> So decouple readahead update with swap cache lookup into a standalone
> helper, let the caller call the readahead update helper if that's
> needed. And convert all swap cache lookups to use swap_cache_get_folio.
>
> After this commit, there are only three special cases for accessing swap
> cache space now: huge memory splitting, migration and shmem replacing,
> because they need to lock the Xarray. Following commits will wrap their
> accesses to the swap cache too with special helpers.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>

My personal taste is that we should have one patch to do the decoupling
of the readahead update and the swapcache lookup, then another patch to
use the swapcache lookup helper in places where it previously could not
be used (due to the coupling of swapcache and readahead).

But, this looks good to me :) Feel free to add:

Acked-by: Nhat Pham <nphamcs@gmail.com>

> ......snip...
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 4 weeks, 1 day ago
On Thu, Sep 4, 2025 at 2:10 AM Nhat Pham <nphamcs@gmail.com> wrote:
>
> On Fri, Aug 22, 2025 at 12:20 PM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> > ......snip...
>
> My personal taste is we should have one patch to do the decoupling of
> readahead update and the swapcache lookup, then another patch to use
> the swapcache lookup helper in places that previously cannot be used
> (due to the coupling of swapcache and readahead).
>
> But, this looks good to me :) Feel free to add:
>
> Acked-by: Nhat Pham <nphamcs@gmail.com>
>

Thanks. I think I'll just drop the readahead update for now; it is
harmless to keep it out of the lock, and it's a really small change. I
didn't measure anything different either way.

We may test and figure out how to do readahead better later; it's not
really related to this series.
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by David Hildenbrand 1 month ago
On 22.08.25 21:20, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> ......snip...



> +void swap_update_readahead(struct folio *folio,
> +			   struct vm_area_struct *vma,
> +			   unsigned long addr)
>   {

Oh, one thing. Regarding recent const-correctness discussions, "folio" 
should probably be const here.

-- 
Cheers

David / dhildenb
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 1 month ago
On Tue, Sep 2, 2025 at 6:13 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 22.08.25 21:20, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> > ......snip...
>
>
>
> > +void swap_update_readahead(struct folio *folio,
> > +                        struct vm_area_struct *vma,
> > +                        unsigned long addr)
> >   {
>
> Oh, one thing. Regarding recent const-correctness discussions, "folio"
> should probably be const here.
>

Not here, swap_update_readahead does folio_test_clear_readahead so...

I'll try to add const in other places where I see the folio is const,
thanks for the info!

> --
> Cheers
>
> David / dhildenb
>
>
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by David Hildenbrand 1 month ago
On 02.09.25 19:13, Kairui Song wrote:
> On Tue, Sep 2, 2025 at 6:13 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 22.08.25 21:20, Kairui Song wrote:
>>> From: Kairui Song <kasong@tencent.com>
>>> ......snip...
>>
>>
>>
>>> +void swap_update_readahead(struct folio *folio,
>>> +                        struct vm_area_struct *vma,
>>> +                        unsigned long addr)
>>>    {
>>
>> Oh, one thing. Regarding recent const-correctness discussions, "folio"
>> should probably be const here.
>>
> 
> Not here, swap_update_readahead does folio_test_clear_readahead so...
> 

Ah, makes sense!

> I'll try add const to other places where I see the folio is const,
> thanks for the info!


Thanks!


-- 
Cheers

David / dhildenb

Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by David Hildenbrand 1 month ago
On 22.08.25 21:20, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> ......snip...
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 99513b74b5d8..ff9eb761a103 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -69,6 +69,21 @@ void show_swap_cache_info(void)
>   	printk("Total swap = %lukB\n", K(total_swap_pages));
>   }
>   
> +/*

While at it, proper kerneldoc?

/**

etc.

Also documenting that it will only return a valid folio pointer or NULL
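
For example, something along these lines (only a sketch, wording up to
you):

/**
 * swap_cache_get_folio - Look up a folio in the swap cache.
 * @entry: swap entry the folio is cached for.
 *
 * A found folio is returned unlocked, with its refcount incremented.
 *
 * Context: Caller must lock the swap device or hold a reference to
 * keep it valid.
 * Return: the folio on a swap cache hit, or NULL on a miss (never an
 * error pointer).
 */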

> + * Lookup a swap entry in the swap cache. A found folio will be returned
> + * unlocked and with its refcount incremented.
> + *
> + * Caller must lock the swap device or hold a reference to keep it valid.
> + */
> +struct folio *swap_cache_get_folio(swp_entry_t entry)
> +{
> +	struct folio *folio = filemap_get_folio(swap_address_space(entry),
> +						swap_cache_index(entry));
> +	if (!IS_ERR(folio))
> +		return folio;
> +	return NULL;

Maybe better as (avoid one !)

if (IS_ERR(folio))
	return NULL;
return folio;

or simply

return IS_ERR(folio) ? NULL : folio.

> +}
> +
>   void *get_shadow_from_swap_cache(swp_entry_t entry)
>   {
>   	struct address_space *address_space = swap_address_space(entry);
> @@ -273,54 +288,40 @@ static inline bool swap_use_vma_readahead(void)
>   }
>   
>   /*
> - * Lookup a swap entry in the swap cache. A found folio will be returned
> - * unlocked and with its refcount incremented - we rely on the kernel
> - * lock getting page table operations atomic even if we drop the folio
> - * lock before returning.
> - *
> - * Caller must lock the swap device or hold a reference to keep it valid.
> + * Update the readahead statistics of a vma or globally.
>    */
> -struct folio *swap_cache_get_folio(swp_entry_t entry,
> -		struct vm_area_struct *vma, unsigned long addr)

This also sounds like a good kerneldoc candidate :)

In particular, documenting that it is valid to pass in vma == NULL (in 
which case the addr is ignored).
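
Again, only a sketch:

/**
 * swap_update_readahead - Update readahead statistics on a swap cache hit.
 * @folio: the folio that was found in the swap cache.
 * @vma: the vma of the fault, or NULL when there is no vma (then the
 *       global readahead statistics are updated instead).
 * @addr: the faulting address, ignored when @vma is NULL.
 */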

> +void swap_update_readahead(struct folio *folio,
> +			   struct vm_area_struct *vma,
> +			   unsigned long addr)
>   {


Apart from that LGTM.

-- 
Cheers

David / dhildenb
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 1 month ago
On Tue, Sep 2, 2025 at 7:22 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 22.08.25 21:20, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> > ......snip...
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index 99513b74b5d8..ff9eb761a103 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -69,6 +69,21 @@ void show_swap_cache_info(void)
> >       printk("Total swap = %lukB\n", K(total_swap_pages));
> >   }
> >
> > +/*
>
> While at it, proper kerneldoc?
>
> /**
>
> etc.
>
> Also documenting that it will only return a valid folio pointer or NULL

Good suggestion. I added some kerneldoc in a later commit for this
function; doing it earlier here is better.

>
> > + * Lookup a swap entry in the swap cache. A found folio will be returned
> > + * unlocked and with its refcount incremented.
> > + *
> > + * Caller must lock the swap device or hold a reference to keep it valid.
> > + */
> > +struct folio *swap_cache_get_folio(swp_entry_t entry)
> > +{
> > +     struct folio *folio = filemap_get_folio(swap_address_space(entry),
> > +                                             swap_cache_index(entry));
> > +     if (!IS_ERR(folio))
> > +             return folio;
> > +     return NULL;
>
> Maybe better as (avoid one !)
>
> if (IS_ERR(folio))
>         return NULL;
> return folio;
>
> or simply
>
> return IS_ERR(folio) ? NULL : folio.
>
> > +}
> > +
> >   void *get_shadow_from_swap_cache(swp_entry_t entry)
> >   {
> >       struct address_space *address_space = swap_address_space(entry);
> > @@ -273,54 +288,40 @@ static inline bool swap_use_vma_readahead(void)
> >   }
> >
> >   /*
> > - * Lookup a swap entry in the swap cache. A found folio will be returned
> > - * unlocked and with its refcount incremented - we rely on the kernel
> > - * lock getting page table operations atomic even if we drop the folio
> > - * lock before returning.
> > - *
> > - * Caller must lock the swap device or hold a reference to keep it valid.
> > + * Update the readahead statistics of a vma or globally.
> >    */
> > -struct folio *swap_cache_get_folio(swp_entry_t entry,
> > -             struct vm_area_struct *vma, unsigned long addr)
>
> This also sounds like a good kerneldoc candidate :)
>
> In particular, documenting that it is valid to pass in vma == NULL (in
> which case the addr is ignored).

Agreed, I forgot this one; will add some doc.

>
> > +void swap_update_readahead(struct folio *folio,
> > +                        struct vm_area_struct *vma,
> > +                        unsigned long addr)
> >   {
>
>
> Apart from that LGTM.

Thanks!

>
> --
> Cheers
>
> David / dhildenb
>
>
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Chris Li 1 month ago
On Tue, Sep 2, 2025 at 3:07 AM David Hildenbrand <david@redhat.com> wrote:
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index 99513b74b5d8..ff9eb761a103 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -69,6 +69,21 @@ void show_swap_cache_info(void)
> >       printk("Total swap = %lukB\n", K(total_swap_pages));
> >   }
> >
> > +/*
>
> While at it, proper kerneldoc?

Agreed, add the kerneldoc while we are at it. Those are important APIs
for interacting with the swap table.

BTW, I already submitted a design doc for the swap table to Kairui,
which will show up as the first patch in the V2 series.

Chris
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by David Hildenbrand 1 month ago
On 02.09.25 14:32, Chris Li wrote:
> On Tue, Sep 2, 2025 at 3:07 AM David Hildenbrand <david@redhat.com> wrote:
>>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>>> index 99513b74b5d8..ff9eb761a103 100644
>>> --- a/mm/swap_state.c
>>> +++ b/mm/swap_state.c
>>> @@ -69,6 +69,21 @@ void show_swap_cache_info(void)
>>>        printk("Total swap = %lukB\n", K(total_swap_pages));
>>>    }
>>>
>>> +/*
>>
>> While at it, proper kerneldoc?
> 
> Agree, add the kerneldoc while we are at it. Those are important API
> to interact with the swap table.
> 
> BTW, I already submitted a design doc for swap table to Kairui, which
> will show up as the first patch in the V2 series

Nice!

-- 
Cheers

David / dhildenb

Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Barry Song 1 month ago
On Sat, Aug 23, 2025 at 3:20 AM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Always use swap_cache_get_folio for swap cache folio look up. The reason
> we are not using it in all places is that it also updates the readahead
> info, and some callsites want to avoid that.
>
> So decouple readahead update with swap cache lookup into a standalone
> helper, let the caller call the readahead update helper if that's
> needed. And convert all swap cache lookups to use swap_cache_get_folio.
>
> After this commit, there are only three special cases for accessing swap
> cache space now: huge memory splitting, migration and shmem replacing,
> because they need to lock the Xarray. Following commits will wrap their
> accesses to the swap cache too with special helpers.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>

Nice! This has cleaned up the confusing mix of using
swap_cache_get_folio with VMA, filemap_get_entry,
swap_cache_get_folio without VMA, and filemap_get_folio.

Reviewed-by: Barry Song <baohua@kernel.org>

Do we have any potential "dropbehind" cases for anonymous folios?
I guess not for now.

__filemap_get_folio()
{
        if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
                folio_clear_dropbehind(folio);
}

Can we mention something about it in the changelog?

Best regards
Barry
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 1 month ago
On Tue, Sep 2, 2025 at 9:14 AM Barry Song <21cnbao@gmail.com> wrote:
>
> On Sat, Aug 23, 2025 at 3:20 AM Kairui Song <ryncsn@gmail.com> wrote:
> >
> > From: Kairui Song <kasong@tencent.com>
> > ......snip...
>
> Nice! This has cleaned up the confusing mix of using
> swap_cache_get_folio with VMA, filemap_get_entry,
> swap_cache_get_folio without VMA, and filemap_get_folio.
>
> Reviewed-by: Barry Song <baohua@kernel.org>

Thanks!

>
> Do we have any potential "dropbehind" cases for anonymous folios?
> I guess not for now.
>

Right, dropbehind doesn't apply to anon yet.

> __filemap_get_folio()
> {
>         if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
>                 folio_clear_dropbehind(folio);
> }
>
> Can we mention something about it in the changelog?

I can add some words about it in the commit message. One can easily
tell that if we want dropbehind for anon, swap_cache_get_folio will be
the right place to handle the related logic.

Now swap_cache_get_folio is the only place for swap cache lookup, and
in the next phase we'll make the swap cache layer the unified way to do
swap synchronization and never bypass it, so maybe dropbehind will be
easier to do too.
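
E.g. something like this, purely hypothetical for now:

struct folio *swap_cache_get_folio(swp_entry_t entry)
{
	struct folio *folio = filemap_get_folio(swap_address_space(entry),
						swap_cache_index(entry));
	if (IS_ERR(folio))
		return NULL;
	/* hypothetical anon dropbehind handling would slot in here */
	if (folio_test_dropbehind(folio))
		folio_clear_dropbehind(folio);
	return folio;
}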

>
> Best regards
> Barry
>
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Chris Li 1 month ago
On Mon, Sep 1, 2025 at 11:13 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> On Tue, Sep 2, 2025 at 9:14 AM Barry Song <21cnbao@gmail.com> wrote:
> >
> > On Sat, Aug 23, 2025 at 3:20 AM Kairui Song <ryncsn@gmail.com> wrote:
> > >
> > > From: Kairui Song <kasong@tencent.com>
> > > ......snip...
> >
> > Nice! This has cleaned up the confusing mix of using
> > swap_cache_get_folio with VMA, filemap_get_entry,
> > swap_cache_get_folio without VMA, and filemap_get_folio.
> >
> > Reviewed-by: Barry Song <baohua@kernel.org>
>
> Thanks!
>
> >
> > Do we have any potential "dropbehind" cases for anonymous folios?
> > I guess not for now.
> >
>
> Right, dropbehind doesn't apply to anon yet.
>
> > __filemap_get_folio()
> > {
> >         if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
> >                 folio_clear_dropbehind(folio);
> > }
> >
> > Can we mention something about it in the changelog?
>
> I can add some words about it in the commit message. One can easily
> tell that if we want dropbehind for anon, swap_caceh_get_folio will be
> the right place to handle related logics.
>
> Now with swap_cache_get_folio being the only place for swap cache
> lookup, and in the next phase we'll make the swap cache layer the
> unified way to do swap synchronization and never bypass it, maybe
> dropbehind will be easier to do too.

Thanks for the cleanup and the unified swap cache synchronization.

Really appreciate it.

Chris
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Baolin Wang 1 month ago

On 2025/8/23 03:20, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
> 
> Always use swap_cache_get_folio for swap cache folio look up. The reason
> we are not using it in all places is that it also updates the readahead
> info, and some callsites want to avoid that.
> 
> So decouple readahead update with swap cache lookup into a standalone
> helper, let the caller call the readahead update helper if that's
> needed. And convert all swap cache lookups to use swap_cache_get_folio.
> 
> After this commit, there are only three special cases for accessing swap
> cache space now: huge memory splitting, migration and shmem replacing,
> because they need to lock the Xarray. Following commits will wrap their
> accesses to the swap cache too with special helpers.
> 
> Signed-off-by: Kairui Song <kasong@tencent.com>

LGTM. With the commit message updated:
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Baoquan He 1 month, 1 week ago
On 08/23/25 at 03:20am, Kairui Song wrote:
> From: Kairui Song <kasong@tencent.com>
......snip...
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index 99513b74b5d8..ff9eb761a103 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -69,6 +69,21 @@ void show_swap_cache_info(void)
>  	printk("Total swap = %lukB\n", K(total_swap_pages));
>  }
>  
> +/*
> + * Lookup a swap entry in the swap cache. A found folio will be returned

'Lookup' is a noun; we should use 'look up', which is a verb, here
instead?

The same applies to all other places in the swap code, even though they
are not introduced by this patchset. Just a nitpick.

> + * unlocked and with its refcount incremented.
> + *
> + * Caller must lock the swap device or hold a reference to keep it valid.
> + */
> ......snip...
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 1 month, 1 week ago
On Wed, Aug 27, 2025 at 12:48 PM Baoquan He <bhe@redhat.com> wrote:
>
> On 08/23/25 at 03:20am, Kairui Song wrote:
> > From: Kairui Song <kasong@tencent.com>
> ......snip...
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index 99513b74b5d8..ff9eb761a103 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -69,6 +69,21 @@ void show_swap_cache_info(void)
> >       printk("Total swap = %lukB\n", K(total_swap_pages));
> >  }
> >
> > +/*
> > + * Lookup a swap entry in the swap cache. A found folio will be returned
>
> Lookup is a noun, we should use 'look up' which is a verb here instead?

Hi Baoquan,

I just checked filemap.c to see how the page cache helpers describe
themselves; 'Look up' is indeed better. Thanks for the review.
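
For the next version, the helper's header comment would then read
something like this (illustrative wording only, simply applying the
suggestion to the comment added by this patch):

	/*
	 * Look up a swap entry in the swap cache. A found folio will be
	 * returned unlocked and with its refcount incremented.
	 *
	 * Caller must lock the swap device or hold a reference to keep it valid.
	 */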
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Chris Li 1 month, 1 week ago
Hi Kairui,

This commit message could use some improvement; I feel that the part I
am interested in, what actually changed, is buried in a lot of detail.

The background is that swap_cache_get_folio() used to do the readahead
update as well, so it took the VMA as part of its arguments. However,
the hibernation usage does not map the swap entry to any VMA, so it was
forced to call filemap_get_entry() on the swap cache instead.

So the TL;DR of what this patch does:

Split the swap readahead update out of swap_cache_get_folio(), so that
the non-VMA hibernation usage can reuse swap_cache_get_folio() as well.
No more calling filemap_get_entry() on the swap cache due to the lack
of a VMA.
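
For illustration, the call pattern after the split looks roughly like
this (a sketch based on the callers converted by this patch, not
verbatim code from it):

	/* VMA-based path, e.g. do_swap_page(): look up, then update stats */
	folio = swap_cache_get_folio(entry);
	if (folio)
		swap_update_readahead(folio, vma, vmf->address);

	/*
	 * Non-VMA user (e.g. hibernation or mincore): the same lookup,
	 * with no readahead update needed.
	 */
	folio = swap_cache_get_folio(entry);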

The code itself looks fine. It has gone through some rounds of
feedback from me already. We can always update the commit message on
the next iteration.

Acked-by: Chris Li <chrisl@kernel.org>

Chris


On Fri, Aug 22, 2025 at 12:20 PM Kairui Song <ryncsn@gmail.com> wrote:
>
> From: Kairui Song <kasong@tencent.com>
>
> Always use swap_cache_get_folio for swap cache folio look up. The reason
> we are not using it in all places is that it also updates the readahead
> info, and some callsites want to avoid that.
>
> So decouple readahead update with swap cache lookup into a standalone
> helper, let the caller call the readahead update helper if that's
> needed. And convert all swap cache lookups to use swap_cache_get_folio.
>
> After this commit, there are only three special cases for accessing swap
> cache space now: huge memory splitting, migration and shmem replacing,
> because they need to lock the Xarray. Following commits will wrap their
I more commonly see 'xarray' or 'XArray' used.
> accesses to the swap cache too with special helpers.
>
> Signed-off-by: Kairui Song <kasong@tencent.com>
> ......snip...
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Kairui Song 1 month, 1 week ago
On Wed, Aug 27, 2025 at 10:59 AM Chris Li <chrisl@kernel.org> wrote:
>
> Hi Kairui,
>
> This commit message could use some improvement; I feel that the part I
> am interested in, what actually changed, is buried in a lot of detail.
>
> The background is that swap_cache_get_folio() used to do the readahead
> update as well, so it took the VMA as part of its arguments. However,
> the hibernation usage does not map the swap entry to any VMA, so it was
> forced to call filemap_get_entry() on the swap cache instead.
>
> So the TL;DR of what this patch does:
>
> Split the swap readahead update out of swap_cache_get_folio(), so that
> the non-VMA hibernation usage can reuse swap_cache_get_folio() as well.
> No more calling filemap_get_entry() on the swap cache due to the lack
> of a VMA.
>
> The code itself looks fine. It has gone through some rounds of
> feedback from me already. We can always update the commit message on
> the next iteration.
>
> Acked-by: Chris Li <chrisl@kernel.org>

Thanks for the review and suggestions. Sounds good to me; I'll update
the commit message accordingly in the next version.
Re: [PATCH 1/9] mm, swap: use unified helper for swap cache look up
Posted by Chris Li 1 month, 1 week ago
BTW, I think this patch can also add a "no functional change expected"
note to the commit message, since that is the case here.

Chris

On Tue, Aug 26, 2025 at 7:47 PM Chris Li <chrisl@kernel.org> wrote:
>
> ......snip...