From: Kairui Song <kasong@tencent.com>
Currently we use one swap_address_space for every 64M chunk to reduce lock
contention; this is like having a set of smaller swap files inside one
big swap file. But when doing a swap cache lookup or insert, we still
use the offset into the whole large swap file. This is OK for
correctness, as the offset (key) is unique.
But the XArray is specifically optimized for small indexes: it creates
the radix tree levels lazily, just enough to fit the largest key stored
in one XArray. So we are wasting tree nodes unnecessarily. A 64M chunk
holds at most 2^14 pages, which needs only 3 levels of the default
64-slot nodes to contain everything. But since we use the offset into
the whole swap file, the key value goes way beyond 64M, and so does the
tree depth: for example, a 128G swap file has page offsets up to 2^25,
which takes 5 levels.
Optimize this by using a new helper, swap_cache_index(), to get a swap
entry's unique offset within its own 64M swap_address_space.
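For illustration only, here is a minimal user-space sketch of the split
(an assumption, not kernel code: the constants mirror mm/swap.h, but the
swp_entry_t plumbing is reduced to a bare page offset):

#include <stdio.h>

#define SWAP_ADDRESS_SPACE_SHIFT 14	/* one space per 2^14 pages = 64M */
#define SWAP_ADDRESS_SPACE_MASK ((1UL << SWAP_ADDRESS_SPACE_SHIFT) - 1)

int main(void)
{
	/* page offset into the whole (large) swap file */
	unsigned long offset = 5000000;

	/* which 64M swap_address_space the entry belongs to */
	unsigned long space = offset >> SWAP_ADDRESS_SPACE_SHIFT;

	/* key inside that space: always fits in 14 bits, so its
	 * XArray needs at most 3 levels of 64-slot nodes */
	unsigned long index = offset & SWAP_ADDRESS_SPACE_MASK;

	printf("offset %lu -> space %lu, index %lu\n", offset, space, index);
	return 0;
}

Before this patch the XArray key for such an entry would be 5000000
itself, forcing a 4-level tree; with the mask it is 2880, and no masked
index ever needs more than 3 levels.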
I see a ~1% performance gain in benchmarks and an actual workload under
high memory pressure.
Tested with `time memhog 128G` inside an 8G memcg using 128G of swap (a
ramdisk with SWP_SYNCHRONOUS_IO dropped), tested 3 times; results are
stable. The result is similar but the improvement is smaller if
SWP_SYNCHRONOUS_IO is enabled, as the swap-out path can never skip the
swap cache:
Before:
6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k
0inputs+0outputs (55major+33555018minor)pagefaults 0swaps
After (1.8% faster):
6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k
0inputs+0outputs (54major+33555027minor)pagefaults 0swaps
Similar results with MySQL and sysbench using swap:
Before:
94055.61 qps
After (0.8% faster):
94834.91 qps
Radix tree slab usage is also very slightly lower.
Signed-off-by: Kairui Song <kasong@tencent.com>
---
mm/huge_memory.c | 2 +-
mm/memcontrol.c | 2 +-
mm/mincore.c | 2 +-
mm/shmem.c | 2 +-
mm/swap.h | 7 +++++++
mm/swap_state.c | 12 ++++++------
mm/swapfile.c | 6 +++---
7 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9859aa4f7553..1208d60792f0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2903,7 +2903,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
split_page_memcg(head, order, new_order);
if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
- offset = swp_offset(folio->swap);
+ offset = swap_cache_index(folio->swap);
swap_cache = swap_address_space(folio->swap);
xa_lock(&swap_cache->i_pages);
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fabce2b50c69..04d7be7f30dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5934,7 +5934,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* Because swap_cache_get_folio() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
*/
- page = find_get_page(swap_address_space(ent), swp_offset(ent));
+ page = find_get_page(swap_address_space(ent), swap_cache_index(ent));
entry->val = ent.val;
return page;
diff --git a/mm/mincore.c b/mm/mincore.c
index dad3622cc963..e31cf1bde614 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -139,7 +139,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
} else {
#ifdef CONFIG_SWAP
*vec = mincore_page(swap_address_space(entry),
- swp_offset(entry));
+ swap_cache_index(entry));
#else
WARN_ON(1);
*vec = 1;
diff --git a/mm/shmem.c b/mm/shmem.c
index 0aad0d9a621b..cbe33ab52a73 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1762,7 +1762,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
old = *foliop;
entry = old->swap;
- swap_index = swp_offset(entry);
+ swap_index = swap_cache_index(entry);
swap_mapping = swap_address_space(entry);
/*
diff --git a/mm/swap.h b/mm/swap.h
index 2de83729aaa8..6ef237d2b029 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -31,11 +31,18 @@ void __swap_writepage(struct folio *folio, struct writeback_control *wbc);
/* One swap address space for each 64M swap space */
#define SWAP_ADDRESS_SPACE_SHIFT 14
#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
+#define SWAP_ADDRESS_SPACE_MASK (BIT(SWAP_ADDRESS_SPACE_SHIFT) - 1)
extern struct address_space *swapper_spaces[];
#define swap_address_space(entry) \
(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
>> SWAP_ADDRESS_SPACE_SHIFT])
+static inline pgoff_t swap_cache_index(swp_entry_t entry)
+{
+ BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK);
+ return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK;
+}
+
void show_swap_cache_info(void);
bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bfc7e8c58a6d..9dbb54c72770 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -72,7 +72,7 @@ void show_swap_cache_info(void)
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swp_offset(entry);
+ pgoff_t idx = swap_cache_index(entry);
struct page *page;
page = xa_load(&address_space->i_pages, idx);
@@ -89,7 +89,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
- pgoff_t idx = swp_offset(entry);
+ pgoff_t idx = swap_cache_index(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
unsigned long i, nr = folio_nr_pages(folio);
void *old;
@@ -144,7 +144,7 @@ void __delete_from_swap_cache(struct folio *folio,
struct address_space *address_space = swap_address_space(entry);
int i;
long nr = folio_nr_pages(folio);
- pgoff_t idx = swp_offset(entry);
+ pgoff_t idx = swap_cache_index(entry);
XA_STATE(xas, &address_space->i_pages, idx);
xas_set_update(&xas, workingset_update_node);
@@ -350,7 +350,7 @@ struct folio *swap_cache_get_folio(swp_entry_t entry,
{
struct folio *folio;
- folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry));
+ folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
if (!IS_ERR(folio)) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
@@ -420,7 +420,7 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
si = get_swap_device(swp);
if (!si)
return ERR_PTR(-ENOENT);
- index = swp_offset(swp);
+ index = swap_cache_index(swp);
folio = filemap_get_folio(swap_address_space(swp), index);
put_swap_device(si);
return folio;
@@ -447,7 +447,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* that would confuse statistics.
*/
folio = filemap_get_folio(swap_address_space(entry),
- swp_offset(entry));
+ swap_cache_index(entry));
if (!IS_ERR(folio))
goto got_folio;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0c36a5c2400f..2e8df95977b7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -138,7 +138,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
struct folio *folio;
int ret = 0;
- folio = filemap_get_folio(swap_address_space(entry), offset);
+ folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
if (IS_ERR(folio))
return 0;
/*
@@ -2110,7 +2110,7 @@ static int try_to_unuse(unsigned int type)
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
- folio = filemap_get_folio(swap_address_space(entry), i);
+ folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
if (IS_ERR(folio))
continue;
@@ -3421,7 +3421,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping);
pgoff_t __folio_swap_cache_index(struct folio *folio)
{
- return swp_offset(folio->swap);
+ return swap_cache_index(folio->swap);
}
EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
--
2.44.0
Hi Kairui,
kernel test robot noticed the following build errors:
[auto build test ERROR on ceph-client/testing]
[also build test ERROR on ceph-client/for-linus trondmy-nfs/linux-next konis-nilfs2/upstream jaegeuk-f2fs/dev-test jaegeuk-f2fs/dev cifs/for-next linus/master v6.9-rc4]
[cannot apply to akpm-mm/mm-everything next-20240418]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/NFS-remove-nfs_page_lengthg-and-usage-of-page_index/20240418-001343
base: https://github.com/ceph/ceph-client.git testing
patch link: https://lore.kernel.org/r/20240417160842.76665-9-ryncsn%40gmail.com
patch subject: [PATCH 8/8] mm/swap: reduce swap cache search space
config: i386-buildonly-randconfig-002-20240419 (https://download.01.org/0day-ci/archive/20240419/202404190258.wljFnvCL-lkp@intel.com/config)
compiler: gcc-9 (Ubuntu 9.5.0-4ubuntu2) 9.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240419/202404190258.wljFnvCL-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404190258.wljFnvCL-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/huge_memory.c: In function '__split_huge_page':
>> mm/huge_memory.c:2906:12: error: implicit declaration of function 'swap_cache_index' [-Werror=implicit-function-declaration]
2906 | offset = swap_cache_index(folio->swap);
| ^~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/swap_cache_index +2906 mm/huge_memory.c
2888
2889 static void __split_huge_page(struct page *page, struct list_head *list,
2890 pgoff_t end, unsigned int new_order)
2891 {
2892 struct folio *folio = page_folio(page);
2893 struct page *head = &folio->page;
2894 struct lruvec *lruvec;
2895 struct address_space *swap_cache = NULL;
2896 unsigned long offset = 0;
2897 int i, nr_dropped = 0;
2898 unsigned int new_nr = 1 << new_order;
2899 int order = folio_order(folio);
2900 unsigned int nr = 1 << order;
2901
2902 /* complete memcg works before add pages to LRU */
2903 split_page_memcg(head, order, new_order);
2904
2905 if (folio_test_anon(folio) && folio_test_swapcache(folio)) {
> 2906 offset = swap_cache_index(folio->swap);
2907 swap_cache = swap_address_space(folio->swap);
2908 xa_lock(&swap_cache->i_pages);
2909 }
2910
2911 /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
2912 lruvec = folio_lruvec_lock(folio);
2913
2914 ClearPageHasHWPoisoned(head);
2915
2916 for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
2917 __split_huge_page_tail(folio, i, lruvec, list, new_order);
2918 /* Some pages can be beyond EOF: drop them from page cache */
2919 if (head[i].index >= end) {
2920 struct folio *tail = page_folio(head + i);
2921
2922 if (shmem_mapping(folio->mapping))
2923 nr_dropped++;
2924 else if (folio_test_clear_dirty(tail))
2925 folio_account_cleaned(tail,
2926 inode_to_wb(folio->mapping->host));
2927 __filemap_remove_folio(tail, NULL);
2928 folio_put(tail);
2929 } else if (!PageAnon(page)) {
2930 __xa_store(&folio->mapping->i_pages, head[i].index,
2931 head + i, 0);
2932 } else if (swap_cache) {
2933 __xa_store(&swap_cache->i_pages, offset + i,
2934 head + i, 0);
2935 }
2936 }
2937
2938 if (!new_order)
2939 ClearPageCompound(head);
2940 else {
2941 struct folio *new_folio = (struct folio *)head;
2942
2943 folio_set_order(new_folio, new_order);
2944 }
2945 unlock_page_lruvec(lruvec);
2946 /* Caller disabled irqs, so they are still disabled here */
2947
2948 split_page_owner(head, order, new_order);
2949
2950 /* See comment in __split_huge_page_tail() */
2951 if (folio_test_anon(folio)) {
2952 /* Additional pin to swap cache */
2953 if (folio_test_swapcache(folio)) {
2954 folio_ref_add(folio, 1 + new_nr);
2955 xa_unlock(&swap_cache->i_pages);
2956 } else {
2957 folio_ref_inc(folio);
2958 }
2959 } else {
2960 /* Additional pin to page cache */
2961 folio_ref_add(folio, 1 + new_nr);
2962 xa_unlock(&folio->mapping->i_pages);
2963 }
2964 local_irq_enable();
2965
2966 if (nr_dropped)
2967 shmem_uncharge(folio->mapping->host, nr_dropped);
2968 remap_page(folio, nr);
2969
2970 if (folio_test_swapcache(folio))
2971 split_swap_cluster(folio->swap);
2972
2973 /*
2974 * set page to its compound_head when split to non order-0 pages, so
2975 * we can skip unlocking it below, since PG_locked is transferred to
2976 * the compound_head of the page and the caller will unlock it.
2977 */
2978 if (new_order)
2979 page = compound_head(page);
2980
2981 for (i = 0; i < nr; i += new_nr) {
2982 struct page *subpage = head + i;
2983 struct folio *new_folio = page_folio(subpage);
2984 if (subpage == page)
2985 continue;
2986 folio_unlock(new_folio);
2987
2988 /*
2989 * Subpages may be freed if there wasn't any mapping
2990 * like if add_to_swap() is running on a lru page that
2991 * had its mapping zapped. And freeing these pages
2992 * requires taking the lru_lock so we do the put_page
2993 * of the tail pages after the split is complete.
2994 */
2995 free_page_and_swap_cache(subpage);
2996 }
2997 }
2998
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Kairui,
kernel test robot noticed the following build errors:
[auto build test ERROR on ceph-client/testing]
[also build test ERROR on ceph-client/for-linus trondmy-nfs/linux-next konis-nilfs2/upstream jaegeuk-f2fs/dev-test jaegeuk-f2fs/dev cifs/for-next linus/master v6.9-rc4]
[cannot apply to akpm-mm/mm-everything next-20240418]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Kairui-Song/NFS-remove-nfs_page_lengthg-and-usage-of-page_index/20240418-001343
base: https://github.com/ceph/ceph-client.git testing
patch link: https://lore.kernel.org/r/20240417160842.76665-9-ryncsn%40gmail.com
patch subject: [PATCH 8/8] mm/swap: reduce swap cache search space
config: alpha-allnoconfig (https://download.01.org/0day-ci/archive/20240419/202404190205.WSYYPQvi-lkp@intel.com/config)
compiler: alpha-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240419/202404190205.WSYYPQvi-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404190205.WSYYPQvi-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/shmem.c: In function 'shmem_replace_folio':
>> mm/shmem.c:1765:22: error: implicit declaration of function 'swap_cache_index' [-Werror=implicit-function-declaration]
1765 | swap_index = swap_cache_index(entry);
| ^~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/swap_cache_index +1765 mm/shmem.c
1753
1754 static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
1755 struct shmem_inode_info *info, pgoff_t index)
1756 {
1757 struct folio *old, *new;
1758 struct address_space *swap_mapping;
1759 swp_entry_t entry;
1760 pgoff_t swap_index;
1761 int error;
1762
1763 old = *foliop;
1764 entry = old->swap;
> 1765 swap_index = swap_cache_index(entry);
1766 swap_mapping = swap_address_space(entry);
1767
1768 /*
1769 * We have arrived here because our zones are constrained, so don't
1770 * limit chance of success by further cpuset and node constraints.
1771 */
1772 gfp &= ~GFP_CONSTRAINT_MASK;
1773 VM_BUG_ON_FOLIO(folio_test_large(old), old);
1774 new = shmem_alloc_folio(gfp, info, index);
1775 if (!new)
1776 return -ENOMEM;
1777
1778 folio_get(new);
1779 folio_copy(new, old);
1780 flush_dcache_folio(new);
1781
1782 __folio_set_locked(new);
1783 __folio_set_swapbacked(new);
1784 folio_mark_uptodate(new);
1785 new->swap = entry;
1786 folio_set_swapcache(new);
1787
1788 /*
1789 * Our caller will very soon move newpage out of swapcache, but it's
1790 * a nice clean interface for us to replace oldpage by newpage there.
1791 */
1792 xa_lock_irq(&swap_mapping->i_pages);
1793 error = shmem_replace_entry(swap_mapping, swap_index, old, new);
1794 if (!error) {
1795 mem_cgroup_migrate(old, new);
1796 __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
1797 __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
1798 __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
1799 __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
1800 }
1801 xa_unlock_irq(&swap_mapping->i_pages);
1802
1803 if (unlikely(error)) {
1804 /*
1805 * Is this possible? I think not, now that our callers check
1806 * both PageSwapCache and page_private after getting page lock;
1807 * but be defensive. Reverse old to newpage for clear and free.
1808 */
1809 old = new;
1810 } else {
1811 folio_add_lru(new);
1812 *foliop = new;
1813 }
1814
1815 folio_clear_swapcache(old);
1816 old->private = NULL;
1817
1818 folio_unlock(old);
1819 folio_put_refs(old, 2);
1820 return error;
1821 }
1822
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
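Both reports share the same root cause: swap_cache_index() is defined
only in the CONFIG_SWAP branch of mm/swap.h, so configs with swap
compiled out (alpha-allnoconfig, and evidently this i386 randconfig)
hit an implicit-declaration error at its callers. A minimal sketch of
one plausible fix, an assumption rather than something taken from this
thread, is a stub next to the existing !CONFIG_SWAP stubs so every
caller sees a declaration:

/* hypothetical placement: mm/swap.h, #else branch for !CONFIG_SWAP */
static inline pgoff_t swap_cache_index(swp_entry_t entry)
{
	return 0;	/* no swap address spaces when swap is compiled out */
}

This is safe at runtime because callers such as __split_huge_page() only
reach the helper when folio_test_swapcache() is true, which cannot
happen without CONFIG_SWAP.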