[PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp

Posted by Usama Arif 1 year, 3 months ago
From: Yu Zhao <yuzhao@google.com>

Here, being unused means containing only zeros and being inaccessible
to userspace. When splitting an isolated thp under reclaim or
migration, the unused subpages can be mapped to the shared zeropage,
hence saving memory. This is particularly helpful when the internal
fragmentation of a thp is high, i.e. it has many untouched subpages.

This is also a prerequisite for the THP low-utilization shrinker
introduced in later patches, where underutilized THPs are split and
the zero-filled pages are freed, saving memory.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Shuang Zhai <zhais@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/rmap.h |  7 ++++-
 mm/huge_memory.c     |  8 ++---
 mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++------
 mm/migrate_device.c  |  4 +--
 4 files changed, 75 insertions(+), 16 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 91b5935e8485..d5e93e44322e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 		      struct vm_area_struct *vma);
 
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+	RMP_LOCKED		= 1 << 0,
+	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
 
 /*
  * rmap_walk_control: To control rmap traversing for specific needs
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c48806ccb9a..af60684e7c70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
 	return false;
 }
 
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
 {
 	int i = 0;
 
@@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
 	if (!folio_test_anon(folio))
 		return;
 	for (;;) {
-		remove_migration_ptes(folio, folio, true);
+		remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
 		i += folio_nr_pages(folio);
 		if (i >= nr)
 			break;
@@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
 	if (nr_dropped)
 		shmem_uncharge(folio->mapping->host, nr_dropped);
-	remap_page(folio, nr);
+	remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
 
 	/*
 	 * set page to its compound_head when split to non order-0 pages, so
@@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		if (mapping)
 			xas_unlock(&xas);
 		local_irq_enable();
-		remap_page(folio, folio_nr_pages(folio));
+		remap_page(folio, folio_nr_pages(folio), 0);
 		ret = -EAGAIN;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f9c62c746be..d039863e014b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
 	return true;
 }
 
+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
+					  struct folio *folio,
+					  unsigned long idx)
+{
+	struct page *page = folio_page(folio, idx);
+	bool contains_data;
+	pte_t newpte;
+	void *addr;
+
+	VM_BUG_ON_PAGE(PageCompound(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
+	    mm_forbids_zeropage(pvmw->vma->vm_mm))
+		return false;
+
+	/*
+	 * The pmd entry mapping the old thp was flushed and the pte mapping
+	 * this subpage has been non present. If the subpage is only zero-filled
+	 * then map it to the shared zeropage.
+	 */
+	addr = kmap_local_page(page);
+	contains_data = memchr_inv(addr, 0, PAGE_SIZE);
+	kunmap_local(addr);
+
+	if (contains_data)
+		return false;
+
+	newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
+					pvmw->vma->vm_page_prot));
+	set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+
+	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
+	return true;
+}
+
+struct rmap_walk_arg {
+	struct folio *folio;
+	bool map_unused_to_zeropage;
+};
+
 /*
  * Restore a potential migration pte to a working pte entry
  */
 static bool remove_migration_pte(struct folio *folio,
-		struct vm_area_struct *vma, unsigned long addr, void *old)
+		struct vm_area_struct *vma, unsigned long addr, void *arg)
 {
-	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+	struct rmap_walk_arg *rmap_walk_arg = arg;
+	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		rmap_t rmap_flags = RMAP_NONE;
@@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio *folio,
 			continue;
 		}
 #endif
+		if (rmap_walk_arg->map_unused_to_zeropage &&
+		    try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+			continue;
 
 		folio_get(folio);
 		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio *folio,
  * Get rid of all migration entries and replace them by
  * references to the indicated page.
  */
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
 {
+	struct rmap_walk_arg rmap_walk_arg = {
+		.folio = src,
+		.map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+	};
+
 	struct rmap_walk_control rwc = {
 		.rmap_one = remove_migration_pte,
-		.arg = src,
+		.arg = &rmap_walk_arg,
 	};
 
-	if (locked)
+	VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+
+	if (flags & RMP_LOCKED)
 		rmap_walk_locked(dst, &rwc);
 	else
 		rmap_walk(dst, &rwc);
@@ -934,7 +988,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
 	 * At this point we know that the migration attempt cannot
 	 * be successful.
 	 */
-	remove_migration_ptes(folio, folio, false);
+	remove_migration_ptes(folio, folio, 0);
 
 	rc = mapping->a_ops->writepage(&folio->page, &wbc);
 
@@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio *src,
 				   struct list_head *ret)
 {
 	if (page_was_mapped)
-		remove_migration_ptes(src, src, false);
+		remove_migration_ptes(src, src, 0);
 	/* Drop an anon_vma reference if we took one */
 	if (anon_vma)
 		put_anon_vma(anon_vma);
@@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
 		lru_add_drain();
 
 	if (old_page_state & PAGE_WAS_MAPPED)
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, 0);
 
 out_unlock_both:
 	folio_unlock(dst);
@@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
 
 	if (page_was_mapped)
 		remove_migration_ptes(src,
-			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
 
 unlock_put_anon:
 	folio_unlock(dst);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 8d687de88a03..9cf26592ac93 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
 			continue;
 
 		folio = page_folio(page);
-		remove_migration_ptes(folio, folio, false);
+		remove_migration_ptes(folio, folio, 0);
 
 		src_pfns[i] = 0;
 		folio_unlock(folio);
@@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
 			dst = src;
 		}
 
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, 0);
 		folio_unlock(src);
 
 		if (folio_is_zone_device(src))
-- 
2.43.5
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Qun-wei Lin (林群崴) 3 months ago
On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
> From: Yu Zhao <yuzhao@google.com>
> 
> Here being unused means containing only zeros and inaccessible to
> userspace. When splitting an isolated thp under reclaim or migration,
> the unused subpages can be mapped to the shared zeropage, hence
> saving
> memory. This is particularly helpful when the internal
> fragmentation of a thp is high, i.e. it has many untouched subpages.
> 
> This is also a prerequisite for THP low utilization shrinker which
> will
> be introduced in later patches, where underutilized THPs are split,
> and
> the zero-filled pages are freed saving memory.
> 
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> Tested-by: Shuang Zhai <zhais@google.com>
> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> ---
>  include/linux/rmap.h |  7 ++++-
>  mm/huge_memory.c     |  8 ++---
>  mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++----
> --
>  mm/migrate_device.c  |  4 +--
>  4 files changed, 75 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 91b5935e8485..d5e93e44322e 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
>  int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
> pgoff_t pgoff,
>  		      struct vm_area_struct *vma);
>  
> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> bool locked);
> +enum rmp_flags {
> +	RMP_LOCKED		= 1 << 0,
> +	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
> +};
> +
> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> flags);
>  
>  /*
>   * rmap_walk_control: To control rmap traversing for specific needs
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0c48806ccb9a..af60684e7c70 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
> vm_area_struct *vma, unsigned long addr,
>  	return false;
>  }
>  
> -static void remap_page(struct folio *folio, unsigned long nr)
> +static void remap_page(struct folio *folio, unsigned long nr, int
> flags)
>  {
>  	int i = 0;
>  
> @@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio,
> unsigned long nr)
>  	if (!folio_test_anon(folio))
>  		return;
>  	for (;;) {
> -		remove_migration_ptes(folio, folio, true);
> +		remove_migration_ptes(folio, folio, RMP_LOCKED |
> flags);
>  		i += folio_nr_pages(folio);
>  		if (i >= nr)
>  			break;
> @@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page
> *page, struct list_head *list,
>  
>  	if (nr_dropped)
>  		shmem_uncharge(folio->mapping->host, nr_dropped);
> -	remap_page(folio, nr);
> +	remap_page(folio, nr, PageAnon(head) ?
> RMP_USE_SHARED_ZEROPAGE : 0);
>  
>  	/*
>  	 * set page to its compound_head when split to non order-0
> pages, so
> @@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct
> page *page, struct list_head *list,
>  		if (mapping)
>  			xas_unlock(&xas);
>  		local_irq_enable();
> -		remap_page(folio, folio_nr_pages(folio));
> +		remap_page(folio, folio_nr_pages(folio), 0);
>  		ret = -EAGAIN;
>  	}
>  
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6f9c62c746be..d039863e014b 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio,
> struct list_head *list)
>  	return true;
>  }
>  
> +static bool try_to_map_unused_to_zeropage(struct
> page_vma_mapped_walk *pvmw,
> +					  struct folio *folio,
> +					  unsigned long idx)
> +{
> +	struct page *page = folio_page(folio, idx);
> +	bool contains_data;
> +	pte_t newpte;
> +	void *addr;
> +
> +	VM_BUG_ON_PAGE(PageCompound(page), page);
> +	VM_BUG_ON_PAGE(!PageAnon(page), page);
> +	VM_BUG_ON_PAGE(!PageLocked(page), page);
> +	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> +
> +	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags &
> VM_LOCKED) ||
> +	    mm_forbids_zeropage(pvmw->vma->vm_mm))
> +		return false;
> +
> +	/*
> +	 * The pmd entry mapping the old thp was flushed and the pte
> mapping
> +	 * this subpage has been non present. If the subpage is only
> zero-filled
> +	 * then map it to the shared zeropage.
> +	 */
> +	addr = kmap_local_page(page);
> +	contains_data = memchr_inv(addr, 0, PAGE_SIZE);
> +	kunmap_local(addr);
> +
> +	if (contains_data)
> +		return false;
> +
> +	newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
> +					pvmw->vma->vm_page_prot));
> +	set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte,
> newpte);
> +
> +	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
> +	return true;
> +}
> +
> +struct rmap_walk_arg {
> +	struct folio *folio;
> +	bool map_unused_to_zeropage;
> +};
> +
>  /*
>   * Restore a potential migration pte to a working pte entry
>   */
>  static bool remove_migration_pte(struct folio *folio,
> -		struct vm_area_struct *vma, unsigned long addr, void
> *old)
> +		struct vm_area_struct *vma, unsigned long addr, void
> *arg)
>  {
> -	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC |
> PVMW_MIGRATION);
> +	struct rmap_walk_arg *rmap_walk_arg = arg;
> +	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr,
> PVMW_SYNC | PVMW_MIGRATION);
>  
>  	while (page_vma_mapped_walk(&pvmw)) {
>  		rmap_t rmap_flags = RMAP_NONE;
> @@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio
> *folio,
>  			continue;
>  		}
>  #endif
> +		if (rmap_walk_arg->map_unused_to_zeropage &&
> +		    try_to_map_unused_to_zeropage(&pvmw, folio,
> idx))
> +			continue;
>  
>  		folio_get(folio);
>  		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
> @@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio
> *folio,
>   * Get rid of all migration entries and replace them by
>   * references to the indicated page.
>   */
> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> bool locked)
> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> flags)
>  {
> +	struct rmap_walk_arg rmap_walk_arg = {
> +		.folio = src,
> +		.map_unused_to_zeropage = flags &
> RMP_USE_SHARED_ZEROPAGE,
> +	};
> +
>  	struct rmap_walk_control rwc = {
>  		.rmap_one = remove_migration_pte,
> -		.arg = src,
> +		.arg = &rmap_walk_arg,
>  	};
>  
> -	if (locked)
> +	VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src !=
> dst), src);
> +
> +	if (flags & RMP_LOCKED)
>  		rmap_walk_locked(dst, &rwc);
>  	else
>  		rmap_walk(dst, &rwc);
> @@ -934,7 +988,7 @@ static int writeout(struct address_space
> *mapping, struct folio *folio)
>  	 * At this point we know that the migration attempt cannot
>  	 * be successful.
>  	 */
> -	remove_migration_ptes(folio, folio, false);
> +	remove_migration_ptes(folio, folio, 0);
>  
>  	rc = mapping->a_ops->writepage(&folio->page, &wbc);
>  
> @@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio
> *src,
>  				   struct list_head *ret)
>  {
>  	if (page_was_mapped)
> -		remove_migration_ptes(src, src, false);
> +		remove_migration_ptes(src, src, 0);
>  	/* Drop an anon_vma reference if we took one */
>  	if (anon_vma)
>  		put_anon_vma(anon_vma);
> @@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t
> put_new_folio, unsigned long private,
>  		lru_add_drain();
>  
>  	if (old_page_state & PAGE_WAS_MAPPED)
> -		remove_migration_ptes(src, dst, false);
> +		remove_migration_ptes(src, dst, 0);
>  
>  out_unlock_both:
>  	folio_unlock(dst);
> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
> get_new_folio,
>  
>  	if (page_was_mapped)
>  		remove_migration_ptes(src,
> -			rc == MIGRATEPAGE_SUCCESS ? dst : src,
> false);
> +			rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
>  
>  unlock_put_anon:
>  	folio_unlock(dst);
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 8d687de88a03..9cf26592ac93 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -424,7 +424,7 @@ static unsigned long
> migrate_device_unmap(unsigned long *src_pfns,
>  			continue;
>  
>  		folio = page_folio(page);
> -		remove_migration_ptes(folio, folio, false);
> +		remove_migration_ptes(folio, folio, 0);
>  
>  		src_pfns[i] = 0;
>  		folio_unlock(folio);
> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
> *src_pfns,
>  			dst = src;
>  		}
>  
> -		remove_migration_ptes(src, dst, false);
> +		remove_migration_ptes(src, dst, 0);
>  		folio_unlock(src);
>  
>  		if (folio_is_zone_device(src))

Hi,

This patch has been in the mainline for some time, but we recently
discovered an issue when both mTHP and MTE (Memory Tagging Extension)
are enabled.

It seems that remapping to the same zeropage might cause MTE tag
mismatches, since MTE tags are associated with physical addresses.

In Android, the tombstone is as follows:

---
Build fingerprint:
'alps/vext_k6993v1_64/k6993v1_64:16/BP2A.250605.031.A3/mp1cs1ofp41:user
debug/dev-keys'
Revision: '0'
ABI: 'arm64'
Timestamp: 2025-08-12 04:58:28.507086720+0800
Process uptime: 0s
Cmdline: /system/bin/audioserver
pid: 8217, tid: 8882, name: binder:8217_4  >>> /system/bin/audioserver
<<<
uid: 1041
tagged_addr_ctrl: 000000000007fff3 (PR_TAGGED_ADDR_ENABLE,
PR_MTE_TCF_SYNC, mask 0xfffe)
signal 11 (SIGSEGV), code 9 (SEGV_MTESERR), fault addr
0x0a00007055220000
Cause: [MTE]: Buffer Overflow, 14016 bytes into a 23070-byte allocation
at 0x705521c940
    x0  0a0000705521c940  x1  0300006f75210ab0  x2  00000000000022a5 
x3  0a0000705521ffc0
    x4  0300006f75212de5  x5  0a000070552222f5  x6  0000000000005a1e 
x7  0000000000000000
    x8  339c000005a1e11e  x9  00000000f041339c  x10 000000009dd48904 
x11 000000000000ffff
    x12 0000000022b70889  x13 000000004cc0b2ff  x14 0000000000000000 
x15 0000000000000010
    x16 00000071cc5d8fc0  x17 00000071cc54e040  x18 0000006ef7bd4000 
x19 0300006f7520d430
    x20 00000071cc5e0340  x21 0000000000005a1e  x22 0a0000705521c940 
x23 00000000000059b5
    x24 00000000000000b1  x25 0300006f75212e4e  x26 caa20000059b511d 
x27 000000000000001d
    x28 0300006f75212e30  x29 0000006f1fe385f0
    lr  00000071cc54200c  sp  0000006f1fe385c0  pc  00000071cc54e158 
pst 0000000020001000

26 total frames
backtrace:
      #00 pc 000000000006c158 
/apex/com.android.runtime/lib64/bionic/libc.so
(__memcpy_aarch64_simd+280) (BuildId: 1e819f3e369d59be98bee38a8fbd0322)
      #01 pc 0000000000060008 
/apex/com.android.runtime/lib64/bionic/libc.so
(scudo::Allocator<scudo::AndroidNormalConfig,
&scudo_malloc_postinit>::reallocate(void*, unsigned long, unsigned
long)+696) (BuildId: 1e819f3e369d59be98bee38a8fbd0322)
      #02 pc 000000000005fccc 
/apex/com.android.runtime/lib64/bionic/libc.so (scudo_realloc+44)
(BuildId: 1e819f3e369d59be98bee38a8fbd0322)
      #03 pc 000000000005c2cc 
/apex/com.android.runtime/lib64/bionic/libc.so (LimitRealloc(void*,
unsigned long)+124) (BuildId: 1e819f3e369d59be98bee38a8fbd0322)
      #04 pc 0000000000059a90 
/apex/com.android.runtime/lib64/bionic/libc.so (realloc+160) (BuildId:
1e819f3e369d59be98bee38a8fbd0322)
      #05 pc 0000000000011a74  /system/lib64/libutils.so
(android::SharedBuffer::editResize(unsigned long) const+68) (BuildId:
7aa2d71e030a290c8dd28236ba0a838f)
      #06 pc 0000000000011ba8  /system/lib64/libutils.so
(android::String8::real_append(char const*, unsigned long)+88)
(BuildId: 7aa2d71e030a290c8dd28236ba0a838f)
      #07 pc 000000000007b880 
/system/lib64/libaudiopolicycomponents.so
(android::DeviceDescriptor::dump(android::String8*, int, bool)
const+208) (BuildId: 553fefffdca2f3a5dde634e123bd2c81)
      #08 pc 000000000008094c 
/system/lib64/libaudiopolicycomponents.so
(android::DeviceVector::dump(android::String8*, android::String8
const&, int, bool) const+636) (BuildId:
553fefffdca2f3a5dde634e123bd2c81)
      #09 pc 0000000000092ed4 
/system/lib64/libaudiopolicycomponents.so
(android::IOProfile::dump(android::String8*, int) const+980) (BuildId:
553fefffdca2f3a5dde634e123bd2c81)
      #10 pc 000000000008bd7c 
/system/lib64/libaudiopolicycomponents.so
(android::HwModule::dump(android::String8*, int) const+1148) (BuildId:
553fefffdca2f3a5dde634e123bd2c81)
      #11 pc 000000000009044c 
/system/lib64/libaudiopolicycomponents.so
(android::HwModuleCollection::dump(android::String8*) const+508)
(BuildId: 553fefffdca2f3a5dde634e123bd2c81)
      #12 pc 0000000000090134 
/system/lib64/libaudiopolicymanagerdefault.so
(android::AudioPolicyManager::dump(android::String8*) const+3908)
(BuildId: fdba879fc1a0c470759bfeb3d594ab81)
      #13 pc 0000000000092e40 
/system/lib64/libaudiopolicymanagerdefault.so
(android::AudioPolicyManager::dump(int)+80) (BuildId:
fdba879fc1a0c470759bfeb3d594ab81)
      #14 pc 000000000022b218  /system/bin/audioserver
(android::AudioPolicyService::dump(int,
android::Vector<android::String16> const&)+392) (BuildId:
1988c27ce74b125f598a07a93367cfdd)
      #15 pc 000000000022c8cc  /system/bin/audioserver (non-virtual
thunk to android::AudioPolicyService::dump(int,
android::Vector<android::String16> const&)+12) (BuildId:
1988c27ce74b125f598a07a93367cfdd)
      #16 pc 00000000000883f4  /system/lib64/libbinder.so
(android::BBinder::onTransact(unsigned int, android::Parcel const&,
android::Parcel*, unsigned int)+340) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
      #17 pc 000000000003fadc  /system/lib64/audiopolicy-aidl-cpp.so
(android::media::BnAudioPolicyService::onTransact(unsigned int,
android::Parcel const&, android::Parcel*, unsigned int)+19884)
(BuildId: ae185d80e4e54668275f262317dc2d7d)
      #18 pc 000000000022adc4  /system/bin/audioserver
(android::AudioPolicyService::onTransact(unsigned int, android::Parcel
const&, android::Parcel*, unsigned int)+1076) (BuildId:
1988c27ce74b125f598a07a93367cfdd)
      #19 pc 0000000000048adc  /system/lib64/libbinder.so
(android::IPCThreadState::executeCommand(int)+748) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
      #20 pc 0000000000051788  /system/lib64/libbinder.so
(android::IPCThreadState::joinThreadPool(bool)+296) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
      #21 pc 000000000007e528  /system/lib64/libbinder.so
(android::PoolThread::threadLoop()+24) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
      #22 pc 0000000000019268  /system/lib64/libutils.so
(android::Thread::_threadLoop(void*)+248) (BuildId:
7aa2d71e030a290c8dd28236ba0a838f)
      #23 pc 000000000001b994  /system/lib64/libutils.so
(libutil_thread_trampoline(void*)
(.__uniq.226528677032898775202282855395389835431)+20) (BuildId:
7aa2d71e030a290c8dd28236ba0a838f)
      #24 pc 0000000000083c8c 
/apex/com.android.runtime/lib64/bionic/libc.so
(__pthread_start(void*)+236) (BuildId:
1e819f3e369d59be98bee38a8fbd0322)
      #25 pc 00000000000761a0 
/apex/com.android.runtime/lib64/bionic/libc.so (__start_thread+64)
(BuildId: 1e819f3e369d59be98bee38a8fbd0322)

Memory tags around the fault address (0xa00007055220000), one tag per
16 bytes:
      0x705521f800: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521f900: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521fa00: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521fb00: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521fc00: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521fd00: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521fe00: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
      0x705521ff00: a  a  a  a  a  a  a  a  a  a  a  a  a  a  a  a
    =>0x7055220000:[0] 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220100: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220200: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220300: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220400: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220500: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220600: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
      0x7055220700: 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
---

Whenever memory pressure is high, this can happen to any process
with MTE enabled.

Any suggestion is appreciated.

Thanks,
Qun-wei
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by David Hildenbrand 3 months ago
On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>> From: Yu Zhao <yuzhao@google.com>
>>
>> Here being unused means containing only zeros and inaccessible to
>> userspace. When splitting an isolated thp under reclaim or migration,
>> the unused subpages can be mapped to the shared zeropage, hence
>> saving
>> memory. This is particularly helpful when the internal
>> fragmentation of a thp is high, i.e. it has many untouched subpages.
>>
>> This is also a prerequisite for THP low utilization shrinker which
>> will
>> be introduced in later patches, where underutilized THPs are split,
>> and
>> the zero-filled pages are freed saving memory.
>>
>> Signed-off-by: Yu Zhao <yuzhao@google.com>
>> Tested-by: Shuang Zhai <zhais@google.com>
>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>> ---
>>   include/linux/rmap.h |  7 ++++-
>>   mm/huge_memory.c     |  8 ++---
>>   mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++----
>> --
>>   mm/migrate_device.c  |  4 +--
>>   4 files changed, 75 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index 91b5935e8485..d5e93e44322e 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
>>   int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
>> pgoff_t pgoff,
>>   		      struct vm_area_struct *vma);
>>   
>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>> bool locked);
>> +enum rmp_flags {
>> +	RMP_LOCKED		= 1 << 0,
>> +	RMP_USE_SHARED_ZEROPAGE	= 1 << 1,
>> +};
>> +
>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>> flags);
>>   
>>   /*
>>    * rmap_walk_control: To control rmap traversing for specific needs
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 0c48806ccb9a..af60684e7c70 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
>> vm_area_struct *vma, unsigned long addr,
>>   	return false;
>>   }
>>   
>> -static void remap_page(struct folio *folio, unsigned long nr)
>> +static void remap_page(struct folio *folio, unsigned long nr, int
>> flags)
>>   {
>>   	int i = 0;
>>   
>> @@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio,
>> unsigned long nr)
>>   	if (!folio_test_anon(folio))
>>   		return;
>>   	for (;;) {
>> -		remove_migration_ptes(folio, folio, true);
>> +		remove_migration_ptes(folio, folio, RMP_LOCKED |
>> flags);
>>   		i += folio_nr_pages(folio);
>>   		if (i >= nr)
>>   			break;
>> @@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page
>> *page, struct list_head *list,
>>   
>>   	if (nr_dropped)
>>   		shmem_uncharge(folio->mapping->host, nr_dropped);
>> -	remap_page(folio, nr);
>> +	remap_page(folio, nr, PageAnon(head) ?
>> RMP_USE_SHARED_ZEROPAGE : 0);
>>   
>>   	/*
>>   	 * set page to its compound_head when split to non order-0
>> pages, so
>> @@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct
>> page *page, struct list_head *list,
>>   		if (mapping)
>>   			xas_unlock(&xas);
>>   		local_irq_enable();
>> -		remap_page(folio, folio_nr_pages(folio));
>> +		remap_page(folio, folio_nr_pages(folio), 0);
>>   		ret = -EAGAIN;
>>   	}
>>   
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 6f9c62c746be..d039863e014b 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio,
>> struct list_head *list)
>>   	return true;
>>   }
>>   
>> +static bool try_to_map_unused_to_zeropage(struct
>> page_vma_mapped_walk *pvmw,
>> +					  struct folio *folio,
>> +					  unsigned long idx)
>> +{
>> +	struct page *page = folio_page(folio, idx);
>> +	bool contains_data;
>> +	pte_t newpte;
>> +	void *addr;
>> +
>> +	VM_BUG_ON_PAGE(PageCompound(page), page);
>> +	VM_BUG_ON_PAGE(!PageAnon(page), page);
>> +	VM_BUG_ON_PAGE(!PageLocked(page), page);
>> +	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
>> +
>> +	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags &
>> VM_LOCKED) ||
>> +	    mm_forbids_zeropage(pvmw->vma->vm_mm))
>> +		return false;
>> +
>> +	/*
>> +	 * The pmd entry mapping the old thp was flushed and the pte
>> mapping
>> +	 * this subpage has been non present. If the subpage is only
>> zero-filled
>> +	 * then map it to the shared zeropage.
>> +	 */
>> +	addr = kmap_local_page(page);
>> +	contains_data = memchr_inv(addr, 0, PAGE_SIZE);
>> +	kunmap_local(addr);
>> +
>> +	if (contains_data)
>> +		return false;
>> +
>> +	newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
>> +					pvmw->vma->vm_page_prot));
>> +	set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte,
>> newpte);
>> +
>> +	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
>> +	return true;
>> +}
>> +
>> +struct rmap_walk_arg {
>> +	struct folio *folio;
>> +	bool map_unused_to_zeropage;
>> +};
>> +
>>   /*
>>    * Restore a potential migration pte to a working pte entry
>>    */
>>   static bool remove_migration_pte(struct folio *folio,
>> -		struct vm_area_struct *vma, unsigned long addr, void
>> *old)
>> +		struct vm_area_struct *vma, unsigned long addr, void
>> *arg)
>>   {
>> -	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC |
>> PVMW_MIGRATION);
>> +	struct rmap_walk_arg *rmap_walk_arg = arg;
>> +	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr,
>> PVMW_SYNC | PVMW_MIGRATION);
>>   
>>   	while (page_vma_mapped_walk(&pvmw)) {
>>   		rmap_t rmap_flags = RMAP_NONE;
>> @@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio
>> *folio,
>>   			continue;
>>   		}
>>   #endif
>> +		if (rmap_walk_arg->map_unused_to_zeropage &&
>> +		    try_to_map_unused_to_zeropage(&pvmw, folio,
>> idx))
>> +			continue;
>>   
>>   		folio_get(folio);
>>   		pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
>> @@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio
>> *folio,
>>    * Get rid of all migration entries and replace them by
>>    * references to the indicated page.
>>    */
>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>> bool locked)
>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>> flags)
>>   {
>> +	struct rmap_walk_arg rmap_walk_arg = {
>> +		.folio = src,
>> +		.map_unused_to_zeropage = flags &
>> RMP_USE_SHARED_ZEROPAGE,
>> +	};
>> +
>>   	struct rmap_walk_control rwc = {
>>   		.rmap_one = remove_migration_pte,
>> -		.arg = src,
>> +		.arg = &rmap_walk_arg,
>>   	};
>>   
>> -	if (locked)
>> +	VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src !=
>> dst), src);
>> +
>> +	if (flags & RMP_LOCKED)
>>   		rmap_walk_locked(dst, &rwc);
>>   	else
>>   		rmap_walk(dst, &rwc);
>> @@ -934,7 +988,7 @@ static int writeout(struct address_space
>> *mapping, struct folio *folio)
>>   	 * At this point we know that the migration attempt cannot
>>   	 * be successful.
>>   	 */
>> -	remove_migration_ptes(folio, folio, false);
>> +	remove_migration_ptes(folio, folio, 0);
>>   
>>   	rc = mapping->a_ops->writepage(&folio->page, &wbc);
>>   
>> @@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio
>> *src,
>>   				   struct list_head *ret)
>>   {
>>   	if (page_was_mapped)
>> -		remove_migration_ptes(src, src, false);
>> +		remove_migration_ptes(src, src, 0);
>>   	/* Drop an anon_vma reference if we took one */
>>   	if (anon_vma)
>>   		put_anon_vma(anon_vma);
>> @@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t
>> put_new_folio, unsigned long private,
>>   		lru_add_drain();
>>   
>>   	if (old_page_state & PAGE_WAS_MAPPED)
>> -		remove_migration_ptes(src, dst, false);
>> +		remove_migration_ptes(src, dst, 0);
>>   
>>   out_unlock_both:
>>   	folio_unlock(dst);
>> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
>> get_new_folio,
>>   
>>   	if (page_was_mapped)
>>   		remove_migration_ptes(src,
>> -			rc == MIGRATEPAGE_SUCCESS ? dst : src,
>> false);
>> +			rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
>>   
>>   unlock_put_anon:
>>   	folio_unlock(dst);
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index 8d687de88a03..9cf26592ac93 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -424,7 +424,7 @@ static unsigned long
>> migrate_device_unmap(unsigned long *src_pfns,
>>   			continue;
>>   
>>   		folio = page_folio(page);
>> -		remove_migration_ptes(folio, folio, false);
>> +		remove_migration_ptes(folio, folio, 0);
>>   
>>   		src_pfns[i] = 0;
>>   		folio_unlock(folio);
>> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
>> *src_pfns,
>>   			dst = src;
>>   		}
>>   
>> -		remove_migration_ptes(src, dst, false);
>> +		remove_migration_ptes(src, dst, 0);
>>   		folio_unlock(src);
>>   
>>   		if (folio_is_zone_device(src))
> 
> Hi,
> 
> This patch has been in the mainline for some time, but we recently
> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
> are enabled.
> 
> It seems that remapping to the same zeropage might causes MTE tag
> mismatches, since MTE tags are associated with physical addresses.

Does this only trigger when the VMA has MTE enabled? Maybe we'll have to 
bail out if we detect that MTE is enabled.
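
A minimal sketch of that kind of bail-out, assuming the VMA-level VM_MTE
flag is a sufficient signal (it is only set on arm64 with
CONFIG_ARM64_MTE and is defined as VM_NONE elsewhere, so the extra test
compiles away on other architectures): the existing check in
try_to_map_unused_to_zeropage() would grow one more condition:

	/*
	 * Sketch only, not a tested fix: MTE tags are tied to the
	 * physical page and the shared zeropage cannot carry them, so
	 * skip the optimization for MTE-tagged mappings.
	 */
	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
	    (pvmw->vma->vm_flags & VM_MTE) ||
	    mm_forbids_zeropage(pvmw->vma->vm_mm))
		return false;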

Also, I wonder how KSM and the shared zeropage work in general with 
that, because I would expect similar issues when we de-duplicate memory?

-- 
Cheers

David / dhildenb

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago
On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
> > On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
> >> From: Yu Zhao <yuzhao@google.com>
> >>
> >> Here being unused means containing only zeros and inaccessible to
> >> userspace. When splitting an isolated thp under reclaim or migration,
> >> the unused subpages can be mapped to the shared zeropage, hence
> >> saving
> >> memory. This is particularly helpful when the internal
> >> fragmentation of a thp is high, i.e. it has many untouched subpages.
> >>
> >> This is also a prerequisite for THP low utilization shrinker which
> >> will
> >> be introduced in later patches, where underutilized THPs are split,
> >> and
> >> the zero-filled pages are freed saving memory.
> >>
> >> Signed-off-by: Yu Zhao <yuzhao@google.com>
> >> Tested-by: Shuang Zhai <zhais@google.com>
> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> >> ---
> >>   include/linux/rmap.h |  7 ++++-
> >>   mm/huge_memory.c     |  8 ++---
> >>   mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++----
> >> --
> >>   mm/migrate_device.c  |  4 +--
> >>   4 files changed, 75 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> >> index 91b5935e8485..d5e93e44322e 100644
> >> --- a/include/linux/rmap.h
> >> +++ b/include/linux/rmap.h
> >> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
> >>   int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
> >> pgoff_t pgoff,
> >>                    struct vm_area_struct *vma);
> >>
> >> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> >> bool locked);
> >> +enum rmp_flags {
> >> +    RMP_LOCKED              = 1 << 0,
> >> +    RMP_USE_SHARED_ZEROPAGE = 1 << 1,
> >> +};
> >> +
> >> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> >> flags);
> >>
> >>   /*
> >>    * rmap_walk_control: To control rmap traversing for specific needs
> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> >> index 0c48806ccb9a..af60684e7c70 100644
> >> --- a/mm/huge_memory.c
> >> +++ b/mm/huge_memory.c
> >> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
> >> vm_area_struct *vma, unsigned long addr,
> >>      return false;
> >>   }
> >>
> >> -static void remap_page(struct folio *folio, unsigned long nr)
> >> +static void remap_page(struct folio *folio, unsigned long nr, int
> >> flags)
> >>   {
> >>      int i = 0;
> >>
> >> @@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio,
> >> unsigned long nr)
> >>      if (!folio_test_anon(folio))
> >>              return;
> >>      for (;;) {
> >> -            remove_migration_ptes(folio, folio, true);
> >> +            remove_migration_ptes(folio, folio, RMP_LOCKED |
> >> flags);
> >>              i += folio_nr_pages(folio);
> >>              if (i >= nr)
> >>                      break;
> >> @@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page
> >> *page, struct list_head *list,
> >>
> >>      if (nr_dropped)
> >>              shmem_uncharge(folio->mapping->host, nr_dropped);
> >> -    remap_page(folio, nr);
> >> +    remap_page(folio, nr, PageAnon(head) ?
> >> RMP_USE_SHARED_ZEROPAGE : 0);
> >>
> >>      /*
> >>       * set page to its compound_head when split to non order-0
> >> pages, so
> >> @@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct
> >> page *page, struct list_head *list,
> >>              if (mapping)
> >>                      xas_unlock(&xas);
> >>              local_irq_enable();
> >> -            remap_page(folio, folio_nr_pages(folio));
> >> +            remap_page(folio, folio_nr_pages(folio), 0);
> >>              ret = -EAGAIN;
> >>      }
> >>
> >> diff --git a/mm/migrate.c b/mm/migrate.c
> >> index 6f9c62c746be..d039863e014b 100644
> >> --- a/mm/migrate.c
> >> +++ b/mm/migrate.c
> >> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio,
> >> struct list_head *list)
> >>      return true;
> >>   }
> >>
> >> +static bool try_to_map_unused_to_zeropage(struct
> >> page_vma_mapped_walk *pvmw,
> >> +                                      struct folio *folio,
> >> +                                      unsigned long idx)
> >> +{
> >> +    struct page *page = folio_page(folio, idx);
> >> +    bool contains_data;
> >> +    pte_t newpte;
> >> +    void *addr;
> >> +
> >> +    VM_BUG_ON_PAGE(PageCompound(page), page);
> >> +    VM_BUG_ON_PAGE(!PageAnon(page), page);
> >> +    VM_BUG_ON_PAGE(!PageLocked(page), page);
> >> +    VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> >> +
> >> +    if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags &
> >> VM_LOCKED) ||
> >> +        mm_forbids_zeropage(pvmw->vma->vm_mm))
> >> +            return false;
> >> +
> >> +    /*
> >> +     * The pmd entry mapping the old thp was flushed and the pte
> >> mapping
> >> +     * this subpage has been non present. If the subpage is only
> >> zero-filled
> >> +     * then map it to the shared zeropage.
> >> +     */
> >> +    addr = kmap_local_page(page);
> >> +    contains_data = memchr_inv(addr, 0, PAGE_SIZE);
> >> +    kunmap_local(addr);
> >> +
> >> +    if (contains_data)
> >> +            return false;
> >> +
> >> +    newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
> >> +                                    pvmw->vma->vm_page_prot));
> >> +    set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte,
> >> newpte);
> >> +
> >> +    dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
> >> +    return true;
> >> +}
> >> +
> >> +struct rmap_walk_arg {
> >> +    struct folio *folio;
> >> +    bool map_unused_to_zeropage;
> >> +};
> >> +
> >>   /*
> >>    * Restore a potential migration pte to a working pte entry
> >>    */
> >>   static bool remove_migration_pte(struct folio *folio,
> >> -            struct vm_area_struct *vma, unsigned long addr, void
> >> *old)
> >> +            struct vm_area_struct *vma, unsigned long addr, void
> >> *arg)
> >>   {
> >> -    DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC |
> >> PVMW_MIGRATION);
> >> +    struct rmap_walk_arg *rmap_walk_arg = arg;
> >> +    DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr,
> >> PVMW_SYNC | PVMW_MIGRATION);
> >>
> >>      while (page_vma_mapped_walk(&pvmw)) {
> >>              rmap_t rmap_flags = RMAP_NONE;
> >> @@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio
> >> *folio,
> >>                      continue;
> >>              }
> >>   #endif
> >> +            if (rmap_walk_arg->map_unused_to_zeropage &&
> >> +                try_to_map_unused_to_zeropage(&pvmw, folio,
> >> idx))
> >> +                    continue;
> >>
> >>              folio_get(folio);
> >>              pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
> >> @@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio
> >> *folio,
> >>    * Get rid of all migration entries and replace them by
> >>    * references to the indicated page.
> >>    */
> >> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> >> bool locked)
> >> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> >> flags)
> >>   {
> >> +    struct rmap_walk_arg rmap_walk_arg = {
> >> +            .folio = src,
> >> +            .map_unused_to_zeropage = flags &
> >> RMP_USE_SHARED_ZEROPAGE,
> >> +    };
> >> +
> >>      struct rmap_walk_control rwc = {
> >>              .rmap_one = remove_migration_pte,
> >> -            .arg = src,
> >> +            .arg = &rmap_walk_arg,
> >>      };
> >>
> >> -    if (locked)
> >> +    VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src !=
> >> dst), src);
> >> +
> >> +    if (flags & RMP_LOCKED)
> >>              rmap_walk_locked(dst, &rwc);
> >>      else
> >>              rmap_walk(dst, &rwc);
> >> @@ -934,7 +988,7 @@ static int writeout(struct address_space
> >> *mapping, struct folio *folio)
> >>       * At this point we know that the migration attempt cannot
> >>       * be successful.
> >>       */
> >> -    remove_migration_ptes(folio, folio, false);
> >> +    remove_migration_ptes(folio, folio, 0);
> >>
> >>      rc = mapping->a_ops->writepage(&folio->page, &wbc);
> >>
> >> @@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio
> >> *src,
> >>                                 struct list_head *ret)
> >>   {
> >>      if (page_was_mapped)
> >> -            remove_migration_ptes(src, src, false);
> >> +            remove_migration_ptes(src, src, 0);
> >>      /* Drop an anon_vma reference if we took one */
> >>      if (anon_vma)
> >>              put_anon_vma(anon_vma);
> >> @@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t
> >> put_new_folio, unsigned long private,
> >>              lru_add_drain();
> >>
> >>      if (old_page_state & PAGE_WAS_MAPPED)
> >> -            remove_migration_ptes(src, dst, false);
> >> +            remove_migration_ptes(src, dst, 0);
> >>
> >>   out_unlock_both:
> >>      folio_unlock(dst);
> >> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
> >> get_new_folio,
> >>
> >>      if (page_was_mapped)
> >>              remove_migration_ptes(src,
> >> -                    rc == MIGRATEPAGE_SUCCESS ? dst : src,
> >> false);
> >> +                    rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
> >>
> >>   unlock_put_anon:
> >>      folio_unlock(dst);
> >> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> >> index 8d687de88a03..9cf26592ac93 100644
> >> --- a/mm/migrate_device.c
> >> +++ b/mm/migrate_device.c
> >> @@ -424,7 +424,7 @@ static unsigned long
> >> migrate_device_unmap(unsigned long *src_pfns,
> >>                      continue;
> >>
> >>              folio = page_folio(page);
> >> -            remove_migration_ptes(folio, folio, false);
> >> +            remove_migration_ptes(folio, folio, 0);
> >>
> >>              src_pfns[i] = 0;
> >>              folio_unlock(folio);
> >> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
> >> *src_pfns,
> >>                      dst = src;
> >>              }
> >>
> >> -            remove_migration_ptes(src, dst, false);
> >> +            remove_migration_ptes(src, dst, 0);
> >>              folio_unlock(src);
> >>
> >>              if (folio_is_zone_device(src))
> >
> > Hi,
> >
> > This patch has been in the mainline for some time, but we recently
> > discovered an issue when both mTHP and MTE (Memory Tagging Extension)
> > are enabled.
> >
> > It seems that remapping to the same zeropage might causes MTE tag
> > mismatches, since MTE tags are associated with physical addresses.
>
> Does this only trigger when the VMA has mte enabled? Maybe we'll have to
> bail out if we detect that mte is enabled.

It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.

config RISCV_ISA_SUPM
        bool "Supm extension for userspace pointer masking"
        depends on 64BIT
        default y
        help
          Add support for pointer masking in userspace (Supm) when the
          underlying hardware extension (Smnpm or Ssnpm) is detected at boot.

          If this option is disabled, userspace will be unable to use
          the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.

I wonder if we should disable the THP shrinker on architectures that
define PR_SET_TAGGED_ADDR_CTRL (or PR_GET_TAGGED_ADDR_CTRL).

Cheers,
Lance

>
> Also, I wonder how KSM and the shared zeropage works in general with
> that, because I would expect similar issues when we de-duplicate memory?
>
> --
> Cheers
>
> David / dhildenb
>
>
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by David Hildenbrand 3 months ago
On 18.09.25 14:22, Lance Yang wrote:
> On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>>> From: Yu Zhao <yuzhao@google.com>
>>>>
>>>> Here being unused means containing only zeros and inaccessible to
>>>> userspace. When splitting an isolated thp under reclaim or migration,
>>>> the unused subpages can be mapped to the shared zeropage, hence
>>>> saving
>>>> memory. This is particularly helpful when the internal
>>>> fragmentation of a thp is high, i.e. it has many untouched subpages.
>>>>
>>>> This is also a prerequisite for THP low utilization shrinker which
>>>> will
>>>> be introduced in later patches, where underutilized THPs are split,
>>>> and
>>>> the zero-filled pages are freed saving memory.
>>>>
>>>> Signed-off-by: Yu Zhao <yuzhao@google.com>
>>>> Tested-by: Shuang Zhai <zhais@google.com>
>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>> ---
>>>>    include/linux/rmap.h |  7 ++++-
>>>>    mm/huge_memory.c     |  8 ++---
>>>>    mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++----
>>>> --
>>>>    mm/migrate_device.c  |  4 +--
>>>>    4 files changed, 75 insertions(+), 16 deletions(-)
>>>>
>>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>>> index 91b5935e8485..d5e93e44322e 100644
>>>> --- a/include/linux/rmap.h
>>>> +++ b/include/linux/rmap.h
>>>> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
>>>>    int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
>>>> pgoff_t pgoff,
>>>>                     struct vm_area_struct *vma);
>>>>
>>>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>>>> bool locked);
>>>> +enum rmp_flags {
>>>> +    RMP_LOCKED              = 1 << 0,
>>>> +    RMP_USE_SHARED_ZEROPAGE = 1 << 1,
>>>> +};
>>>> +
>>>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>>>> flags);
>>>>
>>>>    /*
>>>>     * rmap_walk_control: To control rmap traversing for specific needs
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 0c48806ccb9a..af60684e7c70 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
>>>> vm_area_struct *vma, unsigned long addr,
>>>>       return false;
>>>>    }
>>>>
>>>> -static void remap_page(struct folio *folio, unsigned long nr)
>>>> +static void remap_page(struct folio *folio, unsigned long nr, int
>>>> flags)
>>>>    {
>>>>       int i = 0;
>>>>
>>>> @@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio,
>>>> unsigned long nr)
>>>>       if (!folio_test_anon(folio))
>>>>               return;
>>>>       for (;;) {
>>>> -            remove_migration_ptes(folio, folio, true);
>>>> +            remove_migration_ptes(folio, folio, RMP_LOCKED |
>>>> flags);
>>>>               i += folio_nr_pages(folio);
>>>>               if (i >= nr)
>>>>                       break;
>>>> @@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page
>>>> *page, struct list_head *list,
>>>>
>>>>       if (nr_dropped)
>>>>               shmem_uncharge(folio->mapping->host, nr_dropped);
>>>> -    remap_page(folio, nr);
>>>> +    remap_page(folio, nr, PageAnon(head) ?
>>>> RMP_USE_SHARED_ZEROPAGE : 0);
>>>>
>>>>       /*
>>>>        * set page to its compound_head when split to non order-0
>>>> pages, so
>>>> @@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct
>>>> page *page, struct list_head *list,
>>>>               if (mapping)
>>>>                       xas_unlock(&xas);
>>>>               local_irq_enable();
>>>> -            remap_page(folio, folio_nr_pages(folio));
>>>> +            remap_page(folio, folio_nr_pages(folio), 0);
>>>>               ret = -EAGAIN;
>>>>       }
>>>>
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 6f9c62c746be..d039863e014b 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio,
>>>> struct list_head *list)
>>>>       return true;
>>>>    }
>>>>
>>>> +static bool try_to_map_unused_to_zeropage(struct
>>>> page_vma_mapped_walk *pvmw,
>>>> +                                      struct folio *folio,
>>>> +                                      unsigned long idx)
>>>> +{
>>>> +    struct page *page = folio_page(folio, idx);
>>>> +    bool contains_data;
>>>> +    pte_t newpte;
>>>> +    void *addr;
>>>> +
>>>> +    VM_BUG_ON_PAGE(PageCompound(page), page);
>>>> +    VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>> +    VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>> +    VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
>>>> +
>>>> +    if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags &
>>>> VM_LOCKED) ||
>>>> +        mm_forbids_zeropage(pvmw->vma->vm_mm))
>>>> +            return false;
>>>> +
>>>> +    /*
>>>> +     * The pmd entry mapping the old thp was flushed and the pte
>>>> mapping
>>>> +     * this subpage has been non present. If the subpage is only
>>>> zero-filled
>>>> +     * then map it to the shared zeropage.
>>>> +     */
>>>> +    addr = kmap_local_page(page);
>>>> +    contains_data = memchr_inv(addr, 0, PAGE_SIZE);
>>>> +    kunmap_local(addr);
>>>> +
>>>> +    if (contains_data)
>>>> +            return false;
>>>> +
>>>> +    newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
>>>> +                                    pvmw->vma->vm_page_prot));
>>>> +    set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte,
>>>> newpte);
>>>> +
>>>> +    dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
>>>> +    return true;
>>>> +}
>>>> +
>>>> +struct rmap_walk_arg {
>>>> +    struct folio *folio;
>>>> +    bool map_unused_to_zeropage;
>>>> +};
>>>> +
>>>>    /*
>>>>     * Restore a potential migration pte to a working pte entry
>>>>     */
>>>>    static bool remove_migration_pte(struct folio *folio,
>>>> -            struct vm_area_struct *vma, unsigned long addr, void
>>>> *old)
>>>> +            struct vm_area_struct *vma, unsigned long addr, void
>>>> *arg)
>>>>    {
>>>> -    DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC |
>>>> PVMW_MIGRATION);
>>>> +    struct rmap_walk_arg *rmap_walk_arg = arg;
>>>> +    DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr,
>>>> PVMW_SYNC | PVMW_MIGRATION);
>>>>
>>>>       while (page_vma_mapped_walk(&pvmw)) {
>>>>               rmap_t rmap_flags = RMAP_NONE;
>>>> @@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio
>>>> *folio,
>>>>                       continue;
>>>>               }
>>>>    #endif
>>>> +            if (rmap_walk_arg->map_unused_to_zeropage &&
>>>> +                try_to_map_unused_to_zeropage(&pvmw, folio,
>>>> idx))
>>>> +                    continue;
>>>>
>>>>               folio_get(folio);
>>>>               pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
>>>> @@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio
>>>> *folio,
>>>>     * Get rid of all migration entries and replace them by
>>>>     * references to the indicated page.
>>>>     */
>>>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>>>> bool locked)
>>>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>>>> flags)
>>>>    {
>>>> +    struct rmap_walk_arg rmap_walk_arg = {
>>>> +            .folio = src,
>>>> +            .map_unused_to_zeropage = flags &
>>>> RMP_USE_SHARED_ZEROPAGE,
>>>> +    };
>>>> +
>>>>       struct rmap_walk_control rwc = {
>>>>               .rmap_one = remove_migration_pte,
>>>> -            .arg = src,
>>>> +            .arg = &rmap_walk_arg,
>>>>       };
>>>>
>>>> -    if (locked)
>>>> +    VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src !=
>>>> dst), src);
>>>> +
>>>> +    if (flags & RMP_LOCKED)
>>>>               rmap_walk_locked(dst, &rwc);
>>>>       else
>>>>               rmap_walk(dst, &rwc);
>>>> @@ -934,7 +988,7 @@ static int writeout(struct address_space
>>>> *mapping, struct folio *folio)
>>>>        * At this point we know that the migration attempt cannot
>>>>        * be successful.
>>>>        */
>>>> -    remove_migration_ptes(folio, folio, false);
>>>> +    remove_migration_ptes(folio, folio, 0);
>>>>
>>>>       rc = mapping->a_ops->writepage(&folio->page, &wbc);
>>>>
>>>> @@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio
>>>> *src,
>>>>                                  struct list_head *ret)
>>>>    {
>>>>       if (page_was_mapped)
>>>> -            remove_migration_ptes(src, src, false);
>>>> +            remove_migration_ptes(src, src, 0);
>>>>       /* Drop an anon_vma reference if we took one */
>>>>       if (anon_vma)
>>>>               put_anon_vma(anon_vma);
>>>> @@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t
>>>> put_new_folio, unsigned long private,
>>>>               lru_add_drain();
>>>>
>>>>       if (old_page_state & PAGE_WAS_MAPPED)
>>>> -            remove_migration_ptes(src, dst, false);
>>>> +            remove_migration_ptes(src, dst, 0);
>>>>
>>>>    out_unlock_both:
>>>>       folio_unlock(dst);
>>>> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
>>>> get_new_folio,
>>>>
>>>>       if (page_was_mapped)
>>>>               remove_migration_ptes(src,
>>>> -                    rc == MIGRATEPAGE_SUCCESS ? dst : src,
>>>> false);
>>>> +                    rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
>>>>
>>>>    unlock_put_anon:
>>>>       folio_unlock(dst);
>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>> index 8d687de88a03..9cf26592ac93 100644
>>>> --- a/mm/migrate_device.c
>>>> +++ b/mm/migrate_device.c
>>>> @@ -424,7 +424,7 @@ static unsigned long
>>>> migrate_device_unmap(unsigned long *src_pfns,
>>>>                       continue;
>>>>
>>>>               folio = page_folio(page);
>>>> -            remove_migration_ptes(folio, folio, false);
>>>> +            remove_migration_ptes(folio, folio, 0);
>>>>
>>>>               src_pfns[i] = 0;
>>>>               folio_unlock(folio);
>>>> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
>>>> *src_pfns,
>>>>                       dst = src;
>>>>               }
>>>>
>>>> -            remove_migration_ptes(src, dst, false);
>>>> +            remove_migration_ptes(src, dst, 0);
>>>>               folio_unlock(src);
>>>>
>>>>               if (folio_is_zone_device(src))
>>>
>>> Hi,
>>>
>>> This patch has been in the mainline for some time, but we recently
>>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>>> are enabled.
>>>
>>> It seems that remapping to the same zeropage might causes MTE tag
>>> mismatches, since MTE tags are associated with physical addresses.
>>
>> Does this only trigger when the VMA has mte enabled? Maybe we'll have to
>> bail out if we detect that mte is enabled.
> 
> It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
> the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
> 
> config RISCV_ISA_SUPM
>          bool "Supm extension for userspace pointer masking"
>          depends on 64BIT
>          default y
>          help
>            Add support for pointer masking in userspace (Supm) when the
>            underlying hardware extension (Smnpm or Ssnpm) is detected at boot.
> 
>            If this option is disabled, userspace will be unable to use
>            the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
> 
> I wonder if we should disable the THP shrinker for such architectures that

I think where possible we really only want to identify problematic 
(tagged) pages and skip them. And we should either look into fixing KSM 
as well or finding out why KSM is not affected.

-- 
Cheers

David / dhildenb

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago

On 2025/9/18 20:35, David Hildenbrand wrote:
> On 18.09.25 14:22, Lance Yang wrote:
>> On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> 
>> wrote:
>>>
>>> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>>>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>>>> From: Yu Zhao <yuzhao@google.com>
>>>>>
>>>>> Here being unused means containing only zeros and inaccessible to
>>>>> userspace. When splitting an isolated thp under reclaim or migration,
>>>>> the unused subpages can be mapped to the shared zeropage, hence
>>>>> saving
>>>>> memory. This is particularly helpful when the internal
>>>>> fragmentation of a thp is high, i.e. it has many untouched subpages.
>>>>>
>>>>> This is also a prerequisite for THP low utilization shrinker which
>>>>> will
>>>>> be introduced in later patches, where underutilized THPs are split,
>>>>> and
>>>>> the zero-filled pages are freed saving memory.
>>>>>
>>>>> Signed-off-by: Yu Zhao <yuzhao@google.com>
>>>>> Tested-by: Shuang Zhai <zhais@google.com>
>>>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>>>> ---
>>>>>    include/linux/rmap.h |  7 ++++-
>>>>>    mm/huge_memory.c     |  8 ++---
>>>>>    mm/migrate.c         | 72 +++++++++++++++++++++++++++++++++++++ 
>>>>> +----
>>>>> -- 
>>>>>    mm/migrate_device.c  |  4 +--
>>>>>    4 files changed, 75 insertions(+), 16 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>>>> index 91b5935e8485..d5e93e44322e 100644
>>>>> --- a/include/linux/rmap.h
>>>>> +++ b/include/linux/rmap.h
>>>>> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
>>>>>    int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
>>>>> pgoff_t pgoff,
>>>>>                     struct vm_area_struct *vma);
>>>>>
>>>>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>>>>> bool locked);
>>>>> +enum rmp_flags {
>>>>> +    RMP_LOCKED              = 1 << 0,
>>>>> +    RMP_USE_SHARED_ZEROPAGE = 1 << 1,
>>>>> +};
>>>>> +
>>>>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>>>>> flags);
>>>>>
>>>>>    /*
>>>>>     * rmap_walk_control: To control rmap traversing for specific needs
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 0c48806ccb9a..af60684e7c70 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
>>>>> vm_area_struct *vma, unsigned long addr,
>>>>>       return false;
>>>>>    }
>>>>>
>>>>> -static void remap_page(struct folio *folio, unsigned long nr)
>>>>> +static void remap_page(struct folio *folio, unsigned long nr, int
>>>>> flags)
>>>>>    {
>>>>>       int i = 0;
>>>>>
>>>>> @@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio,
>>>>> unsigned long nr)
>>>>>       if (!folio_test_anon(folio))
>>>>>               return;
>>>>>       for (;;) {
>>>>> -            remove_migration_ptes(folio, folio, true);
>>>>> +            remove_migration_ptes(folio, folio, RMP_LOCKED |
>>>>> flags);
>>>>>               i += folio_nr_pages(folio);
>>>>>               if (i >= nr)
>>>>>                       break;
>>>>> @@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page
>>>>> *page, struct list_head *list,
>>>>>
>>>>>       if (nr_dropped)
>>>>>               shmem_uncharge(folio->mapping->host, nr_dropped);
>>>>> -    remap_page(folio, nr);
>>>>> +    remap_page(folio, nr, PageAnon(head) ?
>>>>> RMP_USE_SHARED_ZEROPAGE : 0);
>>>>>
>>>>>       /*
>>>>>        * set page to its compound_head when split to non order-0
>>>>> pages, so
>>>>> @@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct
>>>>> page *page, struct list_head *list,
>>>>>               if (mapping)
>>>>>                       xas_unlock(&xas);
>>>>>               local_irq_enable();
>>>>> -            remap_page(folio, folio_nr_pages(folio));
>>>>> +            remap_page(folio, folio_nr_pages(folio), 0);
>>>>>               ret = -EAGAIN;
>>>>>       }
>>>>>
>>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>>> index 6f9c62c746be..d039863e014b 100644
>>>>> --- a/mm/migrate.c
>>>>> +++ b/mm/migrate.c
>>>>> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio,
>>>>> struct list_head *list)
>>>>>       return true;
>>>>>    }
>>>>>
>>>>> +static bool try_to_map_unused_to_zeropage(struct
>>>>> page_vma_mapped_walk *pvmw,
>>>>> +                                      struct folio *folio,
>>>>> +                                      unsigned long idx)
>>>>> +{
>>>>> +    struct page *page = folio_page(folio, idx);
>>>>> +    bool contains_data;
>>>>> +    pte_t newpte;
>>>>> +    void *addr;
>>>>> +
>>>>> +    VM_BUG_ON_PAGE(PageCompound(page), page);
>>>>> +    VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>>> +    VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>>> +    VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
>>>>> +
>>>>> +    if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags &
>>>>> VM_LOCKED) ||
>>>>> +        mm_forbids_zeropage(pvmw->vma->vm_mm))
>>>>> +            return false;
>>>>> +
>>>>> +    /*
>>>>> +     * The pmd entry mapping the old thp was flushed and the pte
>>>>> mapping
>>>>> +     * this subpage has been non present. If the subpage is only
>>>>> zero-filled
>>>>> +     * then map it to the shared zeropage.
>>>>> +     */
>>>>> +    addr = kmap_local_page(page);
>>>>> +    contains_data = memchr_inv(addr, 0, PAGE_SIZE);
>>>>> +    kunmap_local(addr);
>>>>> +
>>>>> +    if (contains_data)
>>>>> +            return false;
>>>>> +
>>>>> +    newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
>>>>> +                                    pvmw->vma->vm_page_prot));
>>>>> +    set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte,
>>>>> newpte);
>>>>> +
>>>>> +    dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
>>>>> +    return true;
>>>>> +}
>>>>> +
>>>>> +struct rmap_walk_arg {
>>>>> +    struct folio *folio;
>>>>> +    bool map_unused_to_zeropage;
>>>>> +};
>>>>> +
>>>>>    /*
>>>>>     * Restore a potential migration pte to a working pte entry
>>>>>     */
>>>>>    static bool remove_migration_pte(struct folio *folio,
>>>>> -            struct vm_area_struct *vma, unsigned long addr, void
>>>>> *old)
>>>>> +            struct vm_area_struct *vma, unsigned long addr, void
>>>>> *arg)
>>>>>    {
>>>>> -    DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC |
>>>>> PVMW_MIGRATION);
>>>>> +    struct rmap_walk_arg *rmap_walk_arg = arg;
>>>>> +    DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr,
>>>>> PVMW_SYNC | PVMW_MIGRATION);
>>>>>
>>>>>       while (page_vma_mapped_walk(&pvmw)) {
>>>>>               rmap_t rmap_flags = RMAP_NONE;
>>>>> @@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio
>>>>> *folio,
>>>>>                       continue;
>>>>>               }
>>>>>    #endif
>>>>> +            if (rmap_walk_arg->map_unused_to_zeropage &&
>>>>> +                try_to_map_unused_to_zeropage(&pvmw, folio,
>>>>> idx))
>>>>> +                    continue;
>>>>>
>>>>>               folio_get(folio);
>>>>>               pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
>>>>> @@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio
>>>>> *folio,
>>>>>     * Get rid of all migration entries and replace them by
>>>>>     * references to the indicated page.
>>>>>     */
>>>>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>>>>> bool locked)
>>>>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>>>>> flags)
>>>>>    {
>>>>> +    struct rmap_walk_arg rmap_walk_arg = {
>>>>> +            .folio = src,
>>>>> +            .map_unused_to_zeropage = flags &
>>>>> RMP_USE_SHARED_ZEROPAGE,
>>>>> +    };
>>>>> +
>>>>>       struct rmap_walk_control rwc = {
>>>>>               .rmap_one = remove_migration_pte,
>>>>> -            .arg = src,
>>>>> +            .arg = &rmap_walk_arg,
>>>>>       };
>>>>>
>>>>> -    if (locked)
>>>>> +    VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src !=
>>>>> dst), src);
>>>>> +
>>>>> +    if (flags & RMP_LOCKED)
>>>>>               rmap_walk_locked(dst, &rwc);
>>>>>       else
>>>>>               rmap_walk(dst, &rwc);
>>>>> @@ -934,7 +988,7 @@ static int writeout(struct address_space
>>>>> *mapping, struct folio *folio)
>>>>>        * At this point we know that the migration attempt cannot
>>>>>        * be successful.
>>>>>        */
>>>>> -    remove_migration_ptes(folio, folio, false);
>>>>> +    remove_migration_ptes(folio, folio, 0);
>>>>>
>>>>>       rc = mapping->a_ops->writepage(&folio->page, &wbc);
>>>>>
>>>>> @@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio
>>>>> *src,
>>>>>                                  struct list_head *ret)
>>>>>    {
>>>>>       if (page_was_mapped)
>>>>> -            remove_migration_ptes(src, src, false);
>>>>> +            remove_migration_ptes(src, src, 0);
>>>>>       /* Drop an anon_vma reference if we took one */
>>>>>       if (anon_vma)
>>>>>               put_anon_vma(anon_vma);
>>>>> @@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t
>>>>> put_new_folio, unsigned long private,
>>>>>               lru_add_drain();
>>>>>
>>>>>       if (old_page_state & PAGE_WAS_MAPPED)
>>>>> -            remove_migration_ptes(src, dst, false);
>>>>> +            remove_migration_ptes(src, dst, 0);
>>>>>
>>>>>    out_unlock_both:
>>>>>       folio_unlock(dst);
>>>>> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
>>>>> get_new_folio,
>>>>>
>>>>>       if (page_was_mapped)
>>>>>               remove_migration_ptes(src,
>>>>> -                    rc == MIGRATEPAGE_SUCCESS ? dst : src,
>>>>> false);
>>>>> +                    rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
>>>>>
>>>>>    unlock_put_anon:
>>>>>       folio_unlock(dst);
>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>>> index 8d687de88a03..9cf26592ac93 100644
>>>>> --- a/mm/migrate_device.c
>>>>> +++ b/mm/migrate_device.c
>>>>> @@ -424,7 +424,7 @@ static unsigned long
>>>>> migrate_device_unmap(unsigned long *src_pfns,
>>>>>                       continue;
>>>>>
>>>>>               folio = page_folio(page);
>>>>> -            remove_migration_ptes(folio, folio, false);
>>>>> +            remove_migration_ptes(folio, folio, 0);
>>>>>
>>>>>               src_pfns[i] = 0;
>>>>>               folio_unlock(folio);
>>>>> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
>>>>> *src_pfns,
>>>>>                       dst = src;
>>>>>               }
>>>>>
>>>>> -            remove_migration_ptes(src, dst, false);
>>>>> +            remove_migration_ptes(src, dst, 0);
>>>>>               folio_unlock(src);
>>>>>
>>>>>               if (folio_is_zone_device(src))
>>>>
>>>> Hi,
>>>>
>>>> This patch has been in the mainline for some time, but we recently
>>>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>>>> are enabled.
>>>>
>>>> It seems that remapping to the shared zeropage might cause MTE tag
>>>> mismatches, since MTE tags are associated with physical addresses.
>>>
>>> Does this only trigger when the VMA has mte enabled? Maybe we'll have to
>>> bail out if we detect that mte is enabled.
>>
>> It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
>> the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
>>
>> config RISCV_ISA_SUPM
>>          bool "Supm extension for userspace pointer masking"
>>          depends on 64BIT
>>          default y
>>          help
>>            Add support for pointer masking in userspace (Supm) when the
>>            underlying hardware extension (Smnpm or Ssnpm) is detected 
>> at boot.
>>
>>            If this option is disabled, userspace will be unable to use
>>            the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
>>
>> I wonder if we should disable the THP shrinker for such architectures 
>> that
> 
> I think where possible we really only want to identify problematic 
> (tagged) pages and skip them. And we should either look into fixing KSM 
> as well or finding out why KSM is not affected.

Yeah. Seems like we could introduce a new helper,
folio_test_mte_tagged(struct folio *folio). By default, it would return
false, and architectures like arm64 can override it.

Looking at the code, the PG_mte_tagged flag is not set for regular THP.
The MTE status actually comes from the VM_MTE flag in the VMA that maps it.

static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
{
	bool ret = test_bit(PG_mte_tagged, &folio->flags.f);

	VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));

	/*
	 * If the folio is tagged, ensure ordering with a likely subsequent
	 * read of the tags.
	 */
	if (ret)
		smp_rmb();
	return ret;
}

static inline bool page_mte_tagged(struct page *page)
{
	bool ret = test_bit(PG_mte_tagged, &page->flags.f);

	VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));

	/*
	 * If the page is tagged, ensure ordering with a likely subsequent
	 * read of the tags.
	 */
	if (ret)
		smp_rmb();
	return ret;
}

contpte_set_ptes()
	__set_ptes()
		__set_ptes_anysz()
			__sync_cache_and_tags()
				mte_sync_tags()
					set_page_mte_tagged()

Then we could have the THP shrinker skip any folios that are identified
as MTE-tagged.

Cheers,
Lance


Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by David Hildenbrand 3 months ago
>> I think where possible we really only want to identify problematic
>> (tagged) pages and skip them. And we should either look into fixing KSM
>> as well or finding out why KSM is not affected.
> 
> Yeah. Seems like we could introduce a new helper,
> folio_test_mte_tagged(struct
> folio *folio). By default, it would return false, and architectures like
> arm64
> can override it.

If we add a new helper it should instead express the semantics that we cannot deduplicate.

For THP, I recall that only some pages might be tagged. So likely we want to check per page.

> 
> Looking at the code, the PG_mte_tagged flag is not set for regular THP.

I think it's supported for THP per page. Only for hugetlb we tag the whole thing through the head page instead of individual pages.

> The MTE
> status actually comes from the VM_MTE flag in the VMA that maps it.
> 

During the rmap walk we could check the VMA flag, but there would be no way to just stop the THP shrinker scanning this page early.

> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
> {
> 	bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
> 
> 	VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
> 
> 	/*
> 	 * If the folio is tagged, ensure ordering with a likely subsequent
> 	 * read of the tags.
> 	 */
> 	if (ret)
> 		smp_rmb();
> 	return ret;
> }
> 
> static inline bool page_mte_tagged(struct page *page)
> {
> 	bool ret = test_bit(PG_mte_tagged, &page->flags.f);
> 
> 	VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
> 
> 	/*
> 	 * If the page is tagged, ensure ordering with a likely subsequent
> 	 * read of the tags.
> 	 */
> 	if (ret)
> 		smp_rmb();
> 	return ret;
> }
> 
> contpte_set_ptes()
> 	__set_ptes()
> 		__set_ptes_anysz()
> 			__sync_cache_and_tags()
> 				mte_sync_tags()
> 					set_page_mte_tagged()
> 
> Then, having the THP shrinker skip any folios that are identified as
> MTE-tagged.

Likely we should just do something like (maybe we want better naming)

#ifndef page_is_mergable
#define page_is_mergable(page) (true)
#endif

And for arm64 have it be

#define page_is_mergable(page) (!page_mte_tagged(page))


And then do

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1f0813b956436..1cac9093918d6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
  
         for (i = 0; i < folio_nr_pages(folio); i++) {
                 kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
-               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
+               if (page_is_mergable(folio_page(folio, i)) &&
+                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
                         num_zero_pages++;
                         if (num_zero_pages > khugepaged_max_ptes_none) {
                                 kunmap_local(kaddr);
diff --git a/mm/migrate.c b/mm/migrate.c
index 946253c398072..476a9a9091bd3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
  
         if (PageCompound(page))
                 return false;
+       if (!page_is_mergable(page))
+               return false;
         VM_BUG_ON_PAGE(!PageAnon(page), page);
         VM_BUG_ON_PAGE(!PageLocked(page), page);
         VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);


For KSM, similarly just bail out early. But still wondering if this is already checked
somehow for KSM.

-- 
Cheers

David / dhildenb
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago

On 2025/9/19 15:55, David Hildenbrand wrote:
>>> I think where possible we really only want to identify problematic
>>> (tagged) pages and skip them. And we should either look into fixing KSM
>>> as well or finding out why KSM is not affected.
>>
>> Yeah. Seems like we could introduce a new helper,
>> folio_test_mte_tagged(struct
>> folio *folio). By default, it would return false, and architectures like
>> arm64
>> can override it.
> 
> If we add a new helper it should instead express the semantics that we 
> cannot deduplicate.

Agreed.

> 
> For THP, I recall that only some pages might be tagged. So likely we 
> want to check per page.

Yes, a per-page check would be simpler.

> 
>>
>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
> 
> I think it's supported for THP per page. Only for hugetlb we tag the 
> whole thing through the head page instead of individual pages.

Right. That's exactly what I meant.

> 
>> The MTE
>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>
> 
> During the rmap walk we could check the VMA flag, but there would be no 
> way to just stop the THP shrinker scanning this page early.
> 
>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>> {
>>     bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>
>>     VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>
>>     /*
>>      * If the folio is tagged, ensure ordering with a likely subsequent
>>      * read of the tags.
>>      */
>>     if (ret)
>>         smp_rmb();
>>     return ret;
>> }
>>
>> static inline bool page_mte_tagged(struct page *page)
>> {
>>     bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>
>>     VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>
>>     /*
>>      * If the page is tagged, ensure ordering with a likely subsequent
>>      * read of the tags.
>>      */
>>     if (ret)
>>         smp_rmb();
>>     return ret;
>> }
>>
>> contpte_set_ptes()
>>     __set_ptes()
>>         __set_ptes_anysz()
>>             __sync_cache_and_tags()
>>                 mte_sync_tags()
>>                     set_page_mte_tagged()
>>
>> Then, having the THP shrinker skip any folios that are identified as
>> MTE-tagged.
> 
> Likely we should just do something like (maybe we want better naming)
> 
> #ifndef page_is_mergable
> #define page_is_mergable(page) (true)
> #endif


Maybe something like page_is_optimizable()? Just a thought ;p

> 
> And for arm64 have it be
> 
> #define page_is_mergable(page) (!page_mte_tagged(page))
> 
> 
> And then do
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 1f0813b956436..1cac9093918d6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
> 
>          for (i = 0; i < folio_nr_pages(folio); i++) {
>                  kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
> -               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
> +               if (page_is_mergable(folio_page(folio, i)) &&
> +                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>                          num_zero_pages++;
>                          if (num_zero_pages > khugepaged_max_ptes_none) {
>                                  kunmap_local(kaddr);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 946253c398072..476a9a9091bd3 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct 
> page_vma_mapped_walk *pvmw,
> 
>          if (PageCompound(page))
>                  return false;
> +       if (!page_is_mergable(page))
> +               return false;
>          VM_BUG_ON_PAGE(!PageAnon(page), page);
>          VM_BUG_ON_PAGE(!PageLocked(page), page);
>          VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);

Looks good to me!

> 
> 
> For KSM, similarly just bail out early. But still wondering if this is 
> already checked
> somehow for KSM.

+1 I'm looking for a machine to test it on.

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago

On 2025/9/19 16:14, Lance Yang wrote:
> 
> 
> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>> I think where possible we really only want to identify problematic
>>>> (tagged) pages and skip them. And we should either look into fixing KSM
>>>> as well or finding out why KSM is not affected.
>>>
>>> Yeah. Seems like we could introduce a new helper,
>>> folio_test_mte_tagged(struct
>>> folio *folio). By default, it would return false, and architectures like
>>> arm64
>>> can override it.
>>
>> If we add a new helper it should instead express the semantics that we 
>> cannot deduplicate.
> 
> Agreed.
> 
>>
>> For THP, I recall that only some pages might be tagged. So likely we 
>> want to check per page.
> 
> Yes, a per-page check would be simpler.
> 
>>
>>>
>>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>>
>> I think it's supported for THP per page. Only for hugetlb we tag the 
>> whole thing through the head page instead of individual pages.
> 
> Right. That's exactly what I meant.
> 
>>
>>> The MTE
>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>
>>
>> During the rmap walk we could check the VMA flag, but there would be 
>> no way to just stop the THP shrinker scanning this page early.
>>
>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>> {
>>>     bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>
>>>     VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>
>>>     /*
>>>      * If the folio is tagged, ensure ordering with a likely subsequent
>>>      * read of the tags.
>>>      */
>>>     if (ret)
>>>         smp_rmb();
>>>     return ret;
>>> }
>>>
>>> static inline bool page_mte_tagged(struct page *page)
>>> {
>>>     bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>
>>>     VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>
>>>     /*
>>>      * If the page is tagged, ensure ordering with a likely subsequent
>>>      * read of the tags.
>>>      */
>>>     if (ret)
>>>         smp_rmb();
>>>     return ret;
>>> }
>>>
>>> contpte_set_ptes()
>>>     __set_ptes()
>>>         __set_ptes_anysz()
>>>             __sync_cache_and_tags()
>>>                 mte_sync_tags()
>>>                     set_page_mte_tagged()
>>>
>>> Then, having the THP shrinker skip any folios that are identified as
>>> MTE-tagged.
>>
>> Likely we should just do something like (maybe we want better naming)
>>
>> #ifndef page_is_mergable
>> #define page_is_mergable(page) (true)
>> #endif
> 
> 
> Maybe something like page_is_optimizable()? Just a thought ;p
> 
>>
>> And for arm64 have it be
>>
>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>
>>
>> And then do
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 1f0813b956436..1cac9093918d6 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>
>>          for (i = 0; i < folio_nr_pages(folio); i++) {
>>                  kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>> -               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>> +               if (page_is_mergable(folio_page(folio, i)) &&
>> +                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>                          num_zero_pages++;
>>                          if (num_zero_pages > khugepaged_max_ptes_none) {
>>                                  kunmap_local(kaddr);
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 946253c398072..476a9a9091bd3 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct 
>> page_vma_mapped_walk *pvmw,
>>
>>          if (PageCompound(page))
>>                  return false;
>> +       if (!page_is_mergable(page))
>> +               return false;
>>          VM_BUG_ON_PAGE(!PageAnon(page), page);
>>          VM_BUG_ON_PAGE(!PageLocked(page), page);
>>          VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
> 
> Looks good to me!
> 
>>
>>
>> For KSM, similarly just bail out early. But still wondering if this is 
>> already checked
>> somehow for KSM.
> 
> +1 I'm looking for a machine to test it on.

Interestingly, it seems KSM is already skipping MTE-tagged pages. My test,
running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no merging
activity for those pages ...
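
For reference, the kind of reproducer this boils down to is roughly the
sketch below (illustrative only, not the exact program used; it assumes an
arm64 guest with MTE support and ksmd already enabled through
/sys/kernel/mm/ksm/run). With PROT_MTE, the pages are expected to never
show up in /sys/kernel/mm/ksm/pages_sharing:

#define _GNU_SOURCE
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#ifndef PROT_MTE
#define PROT_MTE 0x20	/* arm64-specific protection bit for mmap/mprotect */
#endif

int main(void)
{
	size_t len = 64 * 4096;	/* 64 small pages with identical content */
	char *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_MTE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* Faulting the pages in under a VM_MTE vma marks them MTE-tagged. */
	memset(buf, 0x5a, len);

	/* Register the range with KSM; ksmd would normally merge it. */
	madvise(buf, len, MADV_MERGEABLE);

	/* Keep the mapping alive while watching the KSM counters. */
	pause();
	return 0;
}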

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago
Hey David,

I believe I've found the exact reason why KSM skips MTE-tagged pages ;p

> 
> 
> On 2025/9/19 16:14, Lance Yang wrote:
>>
>>
>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>> I think where possible we really only want to identify problematic
>>>>> (tagged) pages and skip them. And we should either look into fixing 
>>>>> KSM
>>>>> as well or finding out why KSM is not affected.
>>>>
>>>> Yeah. Seems like we could introduce a new helper,
>>>> folio_test_mte_tagged(struct
>>>> folio *folio). By default, it would return false, and architectures 
>>>> like
>>>> arm64
>>>> can override it.
>>>
>>> If we add a new helper it should instead express the semantics that 
>>> we cannot deduplicate.
>>
>> Agreed.
>>
>>>
>>> For THP, I recall that only some pages might be tagged. So likely we 
>>> want to check per page.
>>
>> Yes, a per-page check would be simpler.
>>
>>>
>>>>
>>>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>>>
>>> I think it's supported for THP per page. Only for hugetlb we tag the 
>>> whole thing through the head page instead of individual pages.
>>
>> Right. That's exactly what I meant.
>>
>>>
>>>> The MTE
>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>
>>>
>>> During the rmap walk we could check the VMA flag, but there would be 
>>> no way to just stop the THP shrinker scanning this page early.
>>>
>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>> {
>>>>     bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>
>>>>     VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>
>>>>     /*
>>>>      * If the folio is tagged, ensure ordering with a likely subsequent
>>>>      * read of the tags.
>>>>      */
>>>>     if (ret)
>>>>         smp_rmb();
>>>>     return ret;
>>>> }
>>>>
>>>> static inline bool page_mte_tagged(struct page *page)
>>>> {
>>>>     bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>
>>>>     VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>
>>>>     /*
>>>>      * If the page is tagged, ensure ordering with a likely subsequent
>>>>      * read of the tags.
>>>>      */
>>>>     if (ret)
>>>>         smp_rmb();
>>>>     return ret;
>>>> }
>>>>
>>>> contpte_set_ptes()
>>>>     __set_ptes()
>>>>         __set_ptes_anysz()
>>>>             __sync_cache_and_tags()
>>>>                 mte_sync_tags()
>>>>                     set_page_mte_tagged()
>>>>
>>>> Then, having the THP shrinker skip any folios that are identified as
>>>> MTE-tagged.
>>>
>>> Likely we should just do something like (maybe we want better naming)
>>>
>>> #ifndef page_is_mergable
>>> #define page_is_mergable(page) (true)
>>> #endif
>>
>>
>> Maybe something like page_is_optimizable()? Just a thought ;p
>>
>>>
>>> And for arm64 have it be
>>>
>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>
>>>
>>> And then do
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 1f0813b956436..1cac9093918d6 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>
>>>          for (i = 0; i < folio_nr_pages(folio); i++) {
>>>                  kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>> -               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>> +               if (page_is_mergable(folio_page(folio, i)) &&
>>> +                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>                          num_zero_pages++;
>>>                          if (num_zero_pages > 
>>> khugepaged_max_ptes_none) {
>>>                                  kunmap_local(kaddr);
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 946253c398072..476a9a9091bd3 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct 
>>> page_vma_mapped_walk *pvmw,
>>>
>>>          if (PageCompound(page))
>>>                  return false;
>>> +       if (!page_is_mergable(page))
>>> +               return false;
>>>          VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>          VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>          VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>
>> Looks good to me!
>>
>>>
>>>
>>> For KSM, similarly just bail out early. But still wondering if this 
>>> is already checked
>>> somehow for KSM.
>>
>> +1 I'm looking for a machine to test it on.
> 
> Interestingly, it seems KSM is already skipping MTE-tagged pages. My test,
> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no merging
> activity for those pages ...

KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c contains
a specific check that prevents merging in this case.

try_to_merge_one_page()
	-> pages_identical()
		-> !memcmp_pages() Fails!
		-> replace_page()


int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = page_address(page1);
	addr2 = page_address(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);

	if (!system_supports_mte() || ret)
		return ret;

	/*
	 * If the page content is identical but at least one of the pages is
	 * tagged, return non-zero to avoid KSM merging. If only one of the
	 * pages is tagged, __set_ptes() may zero or change the tags of the
	 * other page via mte_sync_tags().
	 */
	if (page_mte_tagged(page1) || page_mte_tagged(page2))
		return addr1 != addr2;

	return ret;
}

IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
a non-zero value, which in turn causes pages_identical() to return false.

Cheers,
Lance
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by David Hildenbrand 3 months ago
On 19.09.25 14:19, Lance Yang wrote:
> Hey David,
> 
> I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
> 
>>
>>
>> On 2025/9/19 16:14, Lance Yang wrote:
>>>
>>>
>>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>>> I think where possible we really only want to identify problematic
>>>>>> (tagged) pages and skip them. And we should either look into fixing
>>>>>> KSM
>>>>>> as well or finding out why KSM is not affected.
>>>>>
>>>>> Yeah. Seems like we could introduce a new helper,
>>>>> folio_test_mte_tagged(struct
>>>>> folio *folio). By default, it would return false, and architectures
>>>>> like
>>>>> arm64
>>>>> can override it.
>>>>
>>>> If we add a new helper it should instead express the semantics that
>>>> we cannot deduplicate.
>>>
>>> Agreed.
>>>
>>>>
>>>> For THP, I recall that only some pages might be tagged. So likely we
>>>> want to check per page.
>>>
>>> Yes, a per-page check would be simpler.
>>>
>>>>
>>>>>
>>>>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>>>>
>>>> I think it's supported for THP per page. Only for hugetlb we tag the
>>>> whole thing through the head page instead of individual pages.
>>>
>>> Right. That's exactly what I meant.
>>>
>>>>
>>>>> The MTE
>>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>>
>>>>
>>>> During the rmap walk we could check the VMA flag, but there would be
>>>> no way to just stop the THP shrinker scanning this page early.
>>>>
>>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>>> {
>>>>>      bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>>
>>>>>      VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>>
>>>>>      /*
>>>>>       * If the folio is tagged, ensure ordering with a likely subsequent
>>>>>       * read of the tags.
>>>>>       */
>>>>>      if (ret)
>>>>>          smp_rmb();
>>>>>      return ret;
>>>>> }
>>>>>
>>>>> static inline bool page_mte_tagged(struct page *page)
>>>>> {
>>>>>      bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>>
>>>>>      VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>>
>>>>>      /*
>>>>>       * If the page is tagged, ensure ordering with a likely subsequent
>>>>>       * read of the tags.
>>>>>       */
>>>>>      if (ret)
>>>>>          smp_rmb();
>>>>>      return ret;
>>>>> }
>>>>>
>>>>> contpte_set_ptes()
>>>>>      __set_ptes()
>>>>>          __set_ptes_anysz()
>>>>>              __sync_cache_and_tags()
>>>>>                  mte_sync_tags()
>>>>>                      set_page_mte_tagged()
>>>>>
>>>>> Then, having the THP shrinker skip any folios that are identified as
>>>>> MTE-tagged.
>>>>
>>>> Likely we should just do something like (maybe we want better naming)
>>>>
>>>> #ifndef page_is_mergable
>>>> #define page_is_mergable(page) (true)
>>>> #endif
>>>
>>>
>>> Maybe something like page_is_optimizable()? Just a thought ;p
>>>
>>>>
>>>> And for arm64 have it be
>>>>
>>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>>
>>>>
>>>> And then do
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 1f0813b956436..1cac9093918d6 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>>
>>>>           for (i = 0; i < folio_nr_pages(folio); i++) {
>>>>                   kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>>> -               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>> +               if (page_is_mergable(folio_page(folio, i)) &&
>>>> +                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>>                           num_zero_pages++;
>>>>                           if (num_zero_pages >
>>>> khugepaged_max_ptes_none) {
>>>>                                   kunmap_local(kaddr);
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 946253c398072..476a9a9091bd3 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>>>> page_vma_mapped_walk *pvmw,
>>>>
>>>>           if (PageCompound(page))
>>>>                   return false;
>>>> +       if (!page_is_mergable(page))
>>>> +               return false;
>>>>           VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>>           VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>>           VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>>
>>> Looks good to me!
>>>
>>>>
>>>>
>>>> For KSM, similarly just bail out early. But still wondering if this
>>>> is already checked
>>>> somehow for KSM.
>>>
>>> +1 I'm looking for a machine to test it on.
>>
>> Interestingly, it seems KSM is already skipping MTE-tagged pages. My test,
>> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no merging
>> activity for those pages ...
> 
> KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
> arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c contains
> a specific check that prevents merging in this case.
> 
> try_to_merge_one_page()
> 	-> pages_identical()
> 		-> !memcmp_pages() Fails!
> 		-> replace_page()
> 
> 
> int memcmp_pages(struct page *page1, struct page *page2)
> {
> 	char *addr1, *addr2;
> 	int ret;
> 
> 	addr1 = page_address(page1);
> 	addr2 = page_address(page2);
> 	ret = memcmp(addr1, addr2, PAGE_SIZE);
> 
> 	if (!system_supports_mte() || ret)
> 		return ret;
> 
> 	/*
> 	 * If the page content is identical but at least one of the pages is
> 	 * tagged, return non-zero to avoid KSM merging. If only one of the
> 	 * pages is tagged, __set_ptes() may zero or change the tags of the
> 	 * other page via mte_sync_tags().
> 	 */
> 	if (page_mte_tagged(page1) || page_mte_tagged(page2))
> 		return addr1 != addr2;
> 
> 	return ret;
> }
> 
> IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
> a non-zero value, which in turn causes pages_identical() to return false.

Cool, so we should likely just use that then in the shrinker code. Can 
you send a fix?

-- 
Cheers

David / dhildenb

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago

On 2025/9/19 21:09, David Hildenbrand wrote:
> On 19.09.25 14:19, Lance Yang wrote:
>> Hey David,
>>
>> I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
>>
>>>
>>>
>>> On 2025/9/19 16:14, Lance Yang wrote:
>>>>
>>>>
>>>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>>>> I think where possible we really only want to identify problematic
>>>>>>> (tagged) pages and skip them. And we should either look into fixing
>>>>>>> KSM
>>>>>>> as well or finding out why KSM is not affected.
>>>>>>
>>>>>> Yeah. Seems like we could introduce a new helper,
>>>>>> folio_test_mte_tagged(struct
>>>>>> folio *folio). By default, it would return false, and architectures
>>>>>> like
>>>>>> arm64
>>>>>> can override it.
>>>>>
>>>>> If we add a new helper it should instead express the semantics that
>>>>> we cannot deduplicate.
>>>>
>>>> Agreed.
>>>>
>>>>>
>>>>> For THP, I recall that only some pages might be tagged. So likely we
>>>>> want to check per page.
>>>>
>>>> Yes, a per-page check would be simpler.
>>>>
>>>>>
>>>>>>
>>>>>> Looking at the code, the PG_mte_tagged flag is not set for regular 
>>>>>> THP.
>>>>>
>>>>> I think it's supported for THP per page. Only for hugetlb we tag the
>>>>> whole thing through the head page instead of individual pages.
>>>>
>>>> Right. That's exactly what I meant.
>>>>
>>>>>
>>>>>> The MTE
>>>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>>>
>>>>>
>>>>> During the rmap walk we could check the VMA flag, but there would be
>>>>> no way to just stop the THP shrinker scanning this page early.
>>>>>
>>>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>>>> {
>>>>>>      bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>>>
>>>>>>      VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>>>
>>>>>>      /*
>>>>>>       * If the folio is tagged, ensure ordering with a likely 
>>>>>> subsequent
>>>>>>       * read of the tags.
>>>>>>       */
>>>>>>      if (ret)
>>>>>>          smp_rmb();
>>>>>>      return ret;
>>>>>> }
>>>>>>
>>>>>> static inline bool page_mte_tagged(struct page *page)
>>>>>> {
>>>>>>      bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>>>
>>>>>>      VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>>>
>>>>>>      /*
>>>>>>       * If the page is tagged, ensure ordering with a likely 
>>>>>> subsequent
>>>>>>       * read of the tags.
>>>>>>       */
>>>>>>      if (ret)
>>>>>>          smp_rmb();
>>>>>>      return ret;
>>>>>> }
>>>>>>
>>>>>> contpte_set_ptes()
>>>>>>      __set_ptes()
>>>>>>          __set_ptes_anysz()
>>>>>>              __sync_cache_and_tags()
>>>>>>                  mte_sync_tags()
>>>>>>                      set_page_mte_tagged()
>>>>>>
>>>>>> Then, having the THP shrinker skip any folios that are identified as
>>>>>> MTE-tagged.
>>>>>
>>>>> Likely we should just do something like (maybe we want better naming)
>>>>>
>>>>> #ifndef page_is_mergable
>>>>> #define page_is_mergable(page) (true)
>>>>> #endif
>>>>
>>>>
>>>> Maybe something like page_is_optimizable()? Just a thought ;p
>>>>
>>>>>
>>>>> And for arm64 have it be
>>>>>
>>>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>>>
>>>>>
>>>>> And then do
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 1f0813b956436..1cac9093918d6 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>>>
>>>>>           for (i = 0; i < folio_nr_pages(folio); i++) {
>>>>>                   kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>>>> -               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>>> +               if (page_is_mergable(folio_page(folio, i)) &&
>>>>> +                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>>>                           num_zero_pages++;
>>>>>                           if (num_zero_pages >
>>>>> khugepaged_max_ptes_none) {
>>>>>                                   kunmap_local(kaddr);
>>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>>> index 946253c398072..476a9a9091bd3 100644
>>>>> --- a/mm/migrate.c
>>>>> +++ b/mm/migrate.c
>>>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>>>>> page_vma_mapped_walk *pvmw,
>>>>>
>>>>>           if (PageCompound(page))
>>>>>                   return false;
>>>>> +       if (!page_is_mergable(page))
>>>>> +               return false;
>>>>>           VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>>>           VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>>>           VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>>>
>>>> Looks good to me!
>>>>
>>>>>
>>>>>
>>>>> For KSM, similarly just bail out early. But still wondering if this
>>>>> is already checked
>>>>> somehow for KSM.
>>>>
>>>> +1 I'm looking for a machine to test it on.
>>>
>>> Interestingly, it seems KSM is already skipping MTE-tagged pages. My 
>>> test,
>>> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no 
>>> merging
>>> activity for those pages ...
>>
>> KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
>> arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c 
>> contains
>> a specific check that prevents merging in this case.
>>
>> try_to_merge_one_page()
>>     -> pages_identical()
>>         -> !memcmp_pages() Fails!
>>         -> replace_page()
>>
>>
>> int memcmp_pages(struct page *page1, struct page *page2)
>> {
>>     char *addr1, *addr2;
>>     int ret;
>>
>>     addr1 = page_address(page1);
>>     addr2 = page_address(page2);
>>     ret = memcmp(addr1, addr2, PAGE_SIZE);
>>
>>     if (!system_supports_mte() || ret)
>>         return ret;
>>
>>     /*
>>      * If the page content is identical but at least one of the pages is
>>      * tagged, return non-zero to avoid KSM merging. If only one of the
>>      * pages is tagged, __set_ptes() may zero or change the tags of the
>>      * other page via mte_sync_tags().
>>      */
>>     if (page_mte_tagged(page1) || page_mte_tagged(page2))
>>         return addr1 != addr2;
>>
>>     return ret;
>> }
>>
>> IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
>> a non-zero value, which in turn causes pages_identical() to return false.
> 
> Cool, so we should likely just use that then in the shrinker code. Can 
> you send a fix?

Certainly! I'll get on that ;p
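
For the record, the direction I have in mind is roughly the sketch below
(untested; it assumes pages_identical(), which today is internal to the KSM
code, is made reachable from mm/migrate.c; the now-unused addr and
contains_data locals would be dropped, and thp_underused() would get the
same treatment):

diff --git a/mm/migrate.c b/mm/migrate.c
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
-	addr = kmap_local_page(page);
-	contains_data = memchr_inv(addr, 0, PAGE_SIZE);
-	kunmap_local(addr);
-
-	if (contains_data)
+	/*
+	 * pages_identical() goes through memcmp_pages(), and the arm64
+	 * implementation refuses to treat an MTE-tagged page as identical
+	 * to the (untagged) shared zeropage, so tagged subpages keep their
+	 * own backing page.
+	 */
+	if (!pages_identical(page, ZERO_PAGE(0)))
 		return false;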

Cheers,
Lance

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago

On 2025/9/19 20:19, Lance Yang wrote:
> Hey David,
> 
> I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
> 
>>
>>
>> On 2025/9/19 16:14, Lance Yang wrote:
>>>
>>>
>>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>>> I think where possible we really only want to identify problematic
>>>>>> (tagged) pages and skip them. And we should either look into 
>>>>>> fixing KSM
>>>>>> as well or finding out why KSM is not affected.
>>>>>
>>>>> Yeah. Seems like we could introduce a new helper,
>>>>> folio_test_mte_tagged(struct
>>>>> folio *folio). By default, it would return false, and architectures 
>>>>> like
>>>>> arm64
>>>>> can override it.
>>>>
>>>> If we add a new helper it should instead express the semantics that 
>>>> we cannot deduplicate.
>>>
>>> Agreed.
>>>
>>>>
>>>> For THP, I recall that only some pages might be tagged. So likely we 
>>>> want to check per page.
>>>
>>> Yes, a per-page check would be simpler.
>>>
>>>>
>>>>>
>>>>> Looking at the code, the PG_mte_tagged flag is not set for regular 
>>>>> THP.
>>>>
>>>> I think it's supported for THP per page. Only for hugetlb we tag the 
>>>> whole thing through the head page instead of individual pages.
>>>
>>> Right. That's exactly what I meant.
>>>
>>>>
>>>>> The MTE
>>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>>
>>>>
>>>> During the rmap walk we could check the VMA flag, but there would be 
>>>> no way to just stop the THP shrinker scanning this page early.
>>>>
>>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>>> {
>>>>>     bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>>
>>>>>     VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>>
>>>>>     /*
>>>>>      * If the folio is tagged, ensure ordering with a likely 
>>>>> subsequent
>>>>>      * read of the tags.
>>>>>      */
>>>>>     if (ret)
>>>>>         smp_rmb();
>>>>>     return ret;
>>>>> }
>>>>>
>>>>> static inline bool page_mte_tagged(struct page *page)
>>>>> {
>>>>>     bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>>
>>>>>     VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>>
>>>>>     /*
>>>>>      * If the page is tagged, ensure ordering with a likely subsequent
>>>>>      * read of the tags.
>>>>>      */
>>>>>     if (ret)
>>>>>         smp_rmb();
>>>>>     return ret;
>>>>> }
>>>>>
>>>>> contpte_set_ptes()
>>>>>     __set_ptes()
>>>>>         __set_ptes_anysz()
>>>>>             __sync_cache_and_tags()
>>>>>                 mte_sync_tags()
>>>>>                     set_page_mte_tagged()
>>>>>
>>>>> Then, having the THP shrinker skip any folios that are identified as
>>>>> MTE-tagged.
>>>>
>>>> Likely we should just do something like (maybe we want better naming)
>>>>
>>>> #ifndef page_is_mergable
>>>> #define page_is_mergable(page) (true)
>>>> #endif
>>>
>>>
>>> Maybe something like page_is_optimizable()? Just a thought ;p
>>>
>>>>
>>>> And for arm64 have it be
>>>>
>>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>>
>>>>
>>>> And then do
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 1f0813b956436..1cac9093918d6 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>>
>>>>          for (i = 0; i < folio_nr_pages(folio); i++) {
>>>>                  kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>>> -               if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>> +               if (page_is_mergable(folio_page(folio, i)) &&
>>>> +                   !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>>                          num_zero_pages++;
>>>>                          if (num_zero_pages > 
>>>> khugepaged_max_ptes_none) {
>>>>                                  kunmap_local(kaddr);
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 946253c398072..476a9a9091bd3 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct 
>>>> page_vma_mapped_walk *pvmw,
>>>>
>>>>          if (PageCompound(page))
>>>>                  return false;
>>>> +       if (!page_is_mergable(page))
>>>> +               return false;
>>>>          VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>>          VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>>          VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>>
>>> Looks good to me!
>>>
>>>>
>>>>
>>>> For KSM, similarly just bail out early. But still wondering if this 
>>>> is already checked
>>>> somehow for KSM.
>>>
>>> +1 I'm looking for a machine to test it on.
>>
>> Interestingly, it seems KSM is already skipping MTE-tagged pages. My 
>> test,
>> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no 
>> merging
>> activity for those pages ...
> 
> KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
> arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c contains
> a specific check that prevents merging in this case.
> 
> try_to_merge_one_page()
>      -> pages_identical()
>          -> !memcmp_pages() Fails!
>          -> replace_page()

Forgot to add:

memcmp_pages() is also called in other KSM paths, such as
stable_tree_search(), stable_tree_insert(), and
unstable_tree_search_insert(), effectively blocking MTE-tagged
pages from entering either of KSM's trees.

> 
> 
> int memcmp_pages(struct page *page1, struct page *page2)
> {
>      char *addr1, *addr2;
>      int ret;
> 
>      addr1 = page_address(page1);
>      addr2 = page_address(page2);
>      ret = memcmp(addr1, addr2, PAGE_SIZE);
> 
>      if (!system_supports_mte() || ret)
>          return ret;
> 
>      /*
>       * If the page content is identical but at least one of the pages is
>       * tagged, return non-zero to avoid KSM merging. If only one of the
>       * pages is tagged, __set_ptes() may zero or change the tags of the
>       * other page via mte_sync_tags().
>       */
>      if (page_mte_tagged(page1) || page_mte_tagged(page2))
>          return addr1 != addr2;
> 
>      return ret;
> }
> 
> IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
> a non-zero value, which in turn causes pages_identical() to return false.
> 
> Cheers,
> Lance

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Lance Yang 3 months ago
On Thu, Sep 18, 2025 at 8:22 PM Lance Yang <lance.yang@linux.dev> wrote:
>
> On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
> > > On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
> > >> From: Yu Zhao <yuzhao@google.com>
> > >>
> > >> Here being unused means containing only zeros and inaccessible to
> > >> userspace. When splitting an isolated thp under reclaim or migration,
> > >> the unused subpages can be mapped to the shared zeropage, hence
> > >> saving
> > >> memory. This is particularly helpful when the internal
> > >> fragmentation of a thp is high, i.e. it has many untouched subpages.
> > >>
> > >> This is also a prerequisite for THP low utilization shrinker which
> > >> will
> > >> be introduced in later patches, where underutilized THPs are split,
> > >> and
> > >> the zero-filled pages are freed saving memory.
> > >>
> > >> Signed-off-by: Yu Zhao <yuzhao@google.com>
> > >> Tested-by: Shuang Zhai <zhais@google.com>
> > >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > >> ---
> > >>   include/linux/rmap.h |  7 ++++-
> > >>   mm/huge_memory.c     |  8 ++---
> > >>   mm/migrate.c         | 72 ++++++++++++++++++++++++++++++++++++++----
> > >> --
> > >>   mm/migrate_device.c  |  4 +--
> > >>   4 files changed, 75 insertions(+), 16 deletions(-)
> > >>
> > >> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > >> index 91b5935e8485..d5e93e44322e 100644
> > >> --- a/include/linux/rmap.h
> > >> +++ b/include/linux/rmap.h
> > >> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
> > >>   int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
> > >> pgoff_t pgoff,
> > >>                    struct vm_area_struct *vma);
> > >>
> > >> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> > >> bool locked);
> > >> +enum rmp_flags {
> > >> +    RMP_LOCKED              = 1 << 0,
> > >> +    RMP_USE_SHARED_ZEROPAGE = 1 << 1,
> > >> +};
> > >> +
> > >> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> > >> flags);
> > >>
> > >>   /*
> > >>    * rmap_walk_control: To control rmap traversing for specific needs
> > >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > >> index 0c48806ccb9a..af60684e7c70 100644
> > >> --- a/mm/huge_memory.c
> > >> +++ b/mm/huge_memory.c
> > >> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
> > >> vm_area_struct *vma, unsigned long addr,
> > >>      return false;
> > >>   }
> > >>
> > >> -static void remap_page(struct folio *folio, unsigned long nr)
> > >> [...]
> > >
> > > Hi,
> > >
> > > This patch has been in the mainline for some time, but we recently
> > > discovered an issue when both mTHP and MTE (Memory Tagging Extension)
> > > are enabled.
> > >
> > > It seems that remapping to the same zeropage might cause MTE tag
> > > mismatches, since MTE tags are associated with physical addresses.
> >
> > Does this only trigger when the VMA has mte enabled? Maybe we'll have to
> > bail out if we detect that mte is enabled.
>
> It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
> the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
>
> config RISCV_ISA_SUPM
>         bool "Supm extension for userspace pointer masking"
>         depends on 64BIT
>         default y
>         help
>           Add support for pointer masking in userspace (Supm) when the
>           underlying hardware extension (Smnpm or Ssnpm) is detected at boot.
>
>           If this option is disabled, userspace will be unable to use
>           the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
>
> I wonder if we should disable the THP shrinker for such architectures that
> define PR_SET_TAGGED_ADDR_CTRL (or PR_GET_TAGGED_ADDR_CTRL).

Correction: I meant the arch-level SET_TAGGED_ADDR_CTRL or GET_TAGGED_ADDR_CTRL hooks, i.e. the architectures that
override these fallbacks near the top of kernel/sys.c (around line 114):

#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
#endif
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
#endif

Cheers,
Lance

>
> Cheers,
> Lance
>
> >
> > Also, I wonder how KSM and the shared zeropage work in general with
> > that, because I would expect similar issues when we de-duplicate memory?
> >
> > --
> > Cheers
> >
> > David / dhildenb
> >
> >
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Usama Arif 3 months ago

On 18/09/2025 09:56, David Hildenbrand wrote:
> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>> [...]
>>
>> Hi,
>>
>> This patch has been in the mainline for some time, but we recently
>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>> are enabled.
>>
>> It seems that remapping to the same zeropage might cause MTE tag
>> mismatches, since MTE tags are associated with physical addresses.
> 
> Does this only trigger when the VMA has mte enabled? Maybe we'll have to bail out if we detect that mte is enabled.
> 

I believe MTE is all or nothing, i.e. all the memory is tagged when it is enabled, but I will let the arm folks confirm.

Yeah, unfortunately I think that might be the only way. We can't change the pointers, and I don't think there is a way
to mark the memory as "untagged". If we can't remap to the shared zeropage, then there is no point in the shrinker. I am
guessing that, instead of checking at runtime whether MTE is enabled when remapping to the shared zeropage, we need to
#ifndef the shrinker out when CONFIG_ARM64_MTE is enabled?
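
For the runtime-check alternative, something like the sketch below is what I had in mind. Untested and purely
illustrative: it assumes the arm64 helpers system_supports_mte() and page_mte_tagged() are reachable from generic mm
code, which in practice would probably need a proper arch hook, and the helper name is made up:

/*
 * Illustrative sketch only (not a merged fix): skip the shared-zeropage
 * optimization for subpages that carry MTE tags, since the shared
 * zeropage cannot reproduce per-page tag state.
 */
static bool page_usable_for_zeropage(struct page *page)
{
#ifdef CONFIG_ARM64_MTE
	/* Tagged pages must keep their own physical page. */
	if (system_supports_mte() && page_mte_tagged(page))
		return false;
#endif
	return true;
}

try_to_map_unused_to_zeropage() would then return false early whenever this check fails.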

> Also, I wonder how KSM and the shared zeropage work in general with that, because I would expect similar issues when we de-duplicate memory?
> 

Yeah, that's a very good point!

Also, the initial report mentioned mTHP instead of THP, but I don't think that matters.

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by David Hildenbrand 3 months ago
On 18.09.25 13:42, Usama Arif wrote:
> 
> 
> On 18/09/2025 09:56, David Hildenbrand wrote:
>> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>>> [...]
>>>
>>> Hi,
>>>
>>> This patch has been in the mainline for some time, but we recently
>>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>>> are enabled.
>>>
>>> It seems that remapping to the same zeropage might cause MTE tag
>>> mismatches, since MTE tags are associated with physical addresses.
>>
>> Does this only trigger when the VMA has mte enabled? Maybe we'll have to bail out if we detect that mte is enabled.
>>
> 
> I believe MTE is all or nothing, i.e. all the memory is tagged when it is enabled, but I will let the arm folks confirm.
> 

I think for a process it's only active for mappings where PROT_MTE was requested.

In khugepaged, I recall that we use
copy_mc_user_highpage()->copy_user_highpage(), which on arm64 translates
to its custom copy_highpage(), where the tags are copied as well.

So that's why khugepaged works.
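
From memory, the arm64 copy_highpage() does roughly the following (simplified sketch, the exact details vary between
kernel versions):

/* Simplified, from-memory sketch of arch/arm64/mm/copypage.c:copy_highpage() */
void copy_highpage(struct page *to, struct page *from)
{
	void *kto = page_address(to);
	void *kfrom = page_address(from);

	copy_page(kto, kfrom);

	/* Copy the MTE tags along with the data when the source page is tagged. */
	if (system_supports_mte() && page_mte_tagged(from)) {
		mte_copy_page_tags(kto, kfrom);
		set_page_mte_tagged(to);
	}
}

The shared zeropage path has no such per-page tag copy, so the original tags would simply be lost, which would explain
the reported mismatch.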

KSM? Hmmm. Not sure how that is fenced off. We'd likely need some hidden 
page_mte_tagged() check somewhere.

-- 
Cheers

David / dhildenb

Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Zi Yan 1 year, 1 month ago
On 30 Aug 2024, at 6:03, Usama Arif wrote:

> [...]
>
> +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> +					  struct folio *folio,
> +					  unsigned long idx)
> +{
> +	struct page *page = folio_page(folio, idx);
> +	bool contains_data;
> +	pte_t newpte;
> +	void *addr;
> +
> +	VM_BUG_ON_PAGE(PageCompound(page), page);

This should be:

diff --git a/mm/migrate.c b/mm/migrate.c
index e950fd62607f..7ffdbe078aa7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
        pte_t newpte;
        void *addr;

-       VM_BUG_ON_PAGE(PageCompound(page), page);
+       if (PageCompound(page))
+               return false;
        VM_BUG_ON_PAGE(!PageAnon(page), page);
        VM_BUG_ON_PAGE(!PageLocked(page), page);
        VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);

Otherwise, splitting anonymous large folios to non-order-0 ones just triggers this BUG_ON: after such a split the
pages seen here are parts of the smaller compound folios, so the zeropage replacement does not apply and they should
simply be skipped.

> [...]

Best Regards,
Yan, Zi
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Usama Arif 1 year, 1 month ago

On 23/10/2024 17:21, Zi Yan wrote:
> On 30 Aug 2024, at 6:03, Usama Arif wrote:
> 
>> [...]
>>
>> +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
>> +					  struct folio *folio,
>> +					  unsigned long idx)
>> +{
>> +	struct page *page = folio_page(folio, idx);
>> +	bool contains_data;
>> +	pte_t newpte;
>> +	void *addr;
>> +
>> +	VM_BUG_ON_PAGE(PageCompound(page), page);
> 
> This should be:
> 
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e950fd62607f..7ffdbe078aa7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
>         pte_t newpte;
>         void *addr;
> 
> -       VM_BUG_ON_PAGE(PageCompound(page), page);
> +       if (PageCompound(page))
> +               return false;
>         VM_BUG_ON_PAGE(!PageAnon(page), page);
>         VM_BUG_ON_PAGE(!PageLocked(page), page);
>         VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> 
> Otherwise, splitting anonymous large folios to non order-0 ones just
> triggers this BUG_ON.
> 

That makes sense, would you like to send the fix?

Adding Yu Zhao to "To" in case he has any objections.

Thanks,
Usama

>> [...]
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Yu Zhao 1 year, 1 month ago
On Wed, Oct 23, 2024 at 10:51 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> On 23/10/2024 17:21, Zi Yan wrote:
> > On 30 Aug 2024, at 6:03, Usama Arif wrote:
> >
> >> [...]
> >>
> >> +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> >> +                                      struct folio *folio,
> >> +                                      unsigned long idx)
> >> +{
> >> +    struct page *page = folio_page(folio, idx);
> >> +    bool contains_data;
> >> +    pte_t newpte;
> >> +    void *addr;
> >> +
> >> +    VM_BUG_ON_PAGE(PageCompound(page), page);
> >
> > This should be:
> >
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index e950fd62607f..7ffdbe078aa7 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> >         pte_t newpte;
> >         void *addr;
> >
> > -       VM_BUG_ON_PAGE(PageCompound(page), page);
> > +       if (PageCompound(page))
> > +               return false;
> >         VM_BUG_ON_PAGE(!PageAnon(page), page);
> >         VM_BUG_ON_PAGE(!PageLocked(page), page);
> >         VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> >
> > Otherwise, splitting anonymous large folios to non order-0 ones just
> > triggers this BUG_ON.
> >
>
> That makes sense, would you like to send the fix?
>
> Adding Yu Zhao to "To" in case he has any objections.

LGTM. Thanks!
Re: [PATCH v5 2/6] mm: remap unused subpages to shared zeropage when splitting isolated thp
Posted by Zi Yan 1 year, 1 month ago
On 23 Oct 2024, at 12:50, Usama Arif wrote:

> On 23/10/2024 17:21, Zi Yan wrote:
>> On 30 Aug 2024, at 6:03, Usama Arif wrote:
>>
>>> [...]
>>>
>>> +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
>>> +					  struct folio *folio,
>>> +					  unsigned long idx)
>>> +{
>>> +	struct page *page = folio_page(folio, idx);
>>> +	bool contains_data;
>>> +	pte_t newpte;
>>> +	void *addr;
>>> +
>>> +	VM_BUG_ON_PAGE(PageCompound(page), page);
>>
>> This should be:
>>
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index e950fd62607f..7ffdbe078aa7 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
>>         pte_t newpte;
>>         void *addr;
>>
>> -       VM_BUG_ON_PAGE(PageCompound(page), page);
>> +       if (PageCompound(page))
>> +               return false;
>>         VM_BUG_ON_PAGE(!PageAnon(page), page);
>>         VM_BUG_ON_PAGE(!PageLocked(page), page);
>>         VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
>>
>> Otherwise, splitting anonymous large folios to non order-0 ones just
>> triggers this BUG_ON.
>>
>
> That makes sense, would you like to send the fix?
>
> Adding Yu Zhao to "To" in case he has any objections.
>
Sure, will do.

Best Regards,
Yan, Zi