From: Yu Zhao <yuzhao@google.com>
Here, unused means containing only zeros and inaccessible to
userspace. When splitting an isolated THP under reclaim or migration,
the unused subpages can be mapped to the shared zeropage, hence saving
memory. This is particularly helpful when the internal fragmentation
of a THP is high, i.e. it has many untouched subpages.
This is also a prerequisite for the THP low-utilization shrinker
introduced in later patches, where underutilized THPs are split and
their zero-filled subpages are freed, saving memory.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Shuang Zhai <zhais@google.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
include/linux/rmap.h | 7 ++++-
mm/huge_memory.c | 8 ++---
mm/migrate.c | 72 ++++++++++++++++++++++++++++++++++++++------
mm/migrate_device.c | 4 +--
4 files changed, 75 insertions(+), 16 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 91b5935e8485..d5e93e44322e 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
struct vm_area_struct *vma);
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+enum rmp_flags {
+ RMP_LOCKED = 1 << 0,
+ RMP_USE_SHARED_ZEROPAGE = 1 << 1,
+};
+
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);
/*
* rmap_walk_control: To control rmap traversing for specific needs
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c48806ccb9a..af60684e7c70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
int i = 0;
@@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
if (!folio_test_anon(folio))
return;
for (;;) {
- remove_migration_ptes(folio, folio, true);
+ remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
i += folio_nr_pages(folio);
if (i >= nr)
break;
@@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (nr_dropped)
shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr);
+ remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
/*
* set page to its compound_head when split to non order-0 pages, so
@@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(folio, folio_nr_pages(folio));
+ remap_page(folio, folio_nr_pages(folio), 0);
ret = -EAGAIN;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f9c62c746be..d039863e014b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
return true;
}
+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
+ struct folio *folio,
+ unsigned long idx)
+{
+ struct page *page = folio_page(folio, idx);
+ bool contains_data;
+ pte_t newpte;
+ void *addr;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+ if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
+ mm_forbids_zeropage(pvmw->vma->vm_mm))
+ return false;
+
+ /*
+ * The pmd entry mapping the old thp was flushed and the pte mapping
+ * this subpage has been non present. If the subpage is only zero-filled
+ * then map it to the shared zeropage.
+ */
+ addr = kmap_local_page(page);
+ contains_data = memchr_inv(addr, 0, PAGE_SIZE);
+ kunmap_local(addr);
+
+ if (contains_data)
+ return false;
+
+ newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
+ pvmw->vma->vm_page_prot));
+ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+
+ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
+ return true;
+}
+
+struct rmap_walk_arg {
+ struct folio *folio;
+ bool map_unused_to_zeropage;
+};
+
/*
* Restore a potential migration pte to a working pte entry
*/
static bool remove_migration_pte(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *old)
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ struct rmap_walk_arg *rmap_walk_arg = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
@@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ if (rmap_walk_arg->map_unused_to_zeropage &&
+ try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+ continue;
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio *folio,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
{
+ struct rmap_walk_arg rmap_walk_arg = {
+ .folio = src,
+ .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = src,
+ .arg = &rmap_walk_arg,
};
- if (locked)
+ VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+
+ if (flags & RMP_LOCKED)
rmap_walk_locked(dst, &rwc);
else
rmap_walk(dst, &rwc);
@@ -934,7 +988,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, 0);
rc = mapping->a_ops->writepage(&folio->page, &wbc);
@@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio *src,
struct list_head *ret)
{
if (page_was_mapped)
- remove_migration_ptes(src, src, false);
+ remove_migration_ptes(src, src, 0);
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
@@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
lru_add_drain();
if (old_page_state & PAGE_WAS_MAPPED)
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, 0);
out_unlock_both:
folio_unlock(dst);
@@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (page_was_mapped)
remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 8d687de88a03..9cf26592ac93 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
continue;
folio = page_folio(page);
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
folio_unlock(folio);
@@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long *src_pfns,
dst = src;
}
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, 0);
folio_unlock(src);
if (folio_is_zone_device(src))
--
2.43.5
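As a rough way to observe the effect described above, one can compare the
PFN of a THP subpage against the PFN of a mapping known to be backed by the
shared zeropage. This is only an illustrative sketch, not part of the patch:
reading PFNs from /proc/self/pagemap needs CAP_SYS_ADMIN, the 2M mapping is
not guaranteed to become a THP, and the split itself still has to be
triggered by reclaim (or, later in the series, by the shrinker).

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

#define PAGE_SZ		4096UL
#define PM_PFN_MASK	((1ULL << 55) - 1)	/* pagemap bits 0-54 */
#define PM_PRESENT	(1ULL << 63)

static uint64_t pagemap_pfn(int fd, const void *addr)
{
	uint64_t ent = 0;
	off_t off = (uintptr_t)addr / PAGE_SZ * sizeof(ent);

	if (pread(fd, &ent, sizeof(ent), off) != sizeof(ent) ||
	    !(ent & PM_PRESENT))
		return 0;
	return ent & PM_PFN_MASK;
}

int main(void)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);
	/* A read fault on an untouched anon page maps the shared zeropage. */
	volatile char *ref = mmap(NULL, PAGE_SZ, PROT_READ,
				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *thp = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (fd < 0 || ref == MAP_FAILED || thp == MAP_FAILED)
		return 1;

	(void)ref[0];
	madvise(thp, 2UL << 20, MADV_HUGEPAGE);
	thp[0] = 1;	/* touch a single subpage; the rest stay zero-filled */

	printf("zeropage pfn: 0x%llx\n",
	       (unsigned long long)pagemap_pfn(fd, (const void *)ref));
	printf("subpage pfn:  0x%llx\n",
	       (unsigned long long)pagemap_pfn(fd, thp + PAGE_SZ));
	/*
	 * Once the THP is split under reclaim, the zero-filled subpage is
	 * expected to report the same PFN as the reference mapping.
	 */
	return 0;
}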
On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
> From: Yu Zhao <yuzhao@google.com>
>
> Here being unused means containing only zeros and inaccessible to
> userspace. When splitting an isolated thp under reclaim or migration,
> the unused subpages can be mapped to the shared zeropage, hence
> saving
> memory. This is particularly helpful when the internal
> fragmentation of a thp is high, i.e. it has many untouched subpages.
>
> This is also a prerequisite for THP low utilization shrinker which
> will
> be introduced in later patches, where underutilized THPs are split,
> and
> the zero-filled pages are freed saving memory.
Hi,
This patch has been in the mainline for some time, but we recently
discovered an issue when both mTHP and MTE (Memory Tagging Extension)
are enabled.
It seems that remapping to the shared zeropage might cause MTE tag
mismatches, since MTE tags are associated with physical addresses.
In Android, the tombstone is as follows:
---
Build fingerprint:
'alps/vext_k6993v1_64/k6993v1_64:16/BP2A.250605.031.A3/mp1cs1ofp41:user
debug/dev-keys'
Revision: '0'
ABI: 'arm64'
Timestamp: 2025-08-12 04:58:28.507086720+0800
Process uptime: 0s
Cmdline: /system/bin/audioserver
pid: 8217, tid: 8882, name: binder:8217_4 >>> /system/bin/audioserver
<<<
uid: 1041
tagged_addr_ctrl: 000000000007fff3 (PR_TAGGED_ADDR_ENABLE,
PR_MTE_TCF_SYNC, mask 0xfffe)
signal 11 (SIGSEGV), code 9 (SEGV_MTESERR), fault addr
0x0a00007055220000
Cause: [MTE]: Buffer Overflow, 14016 bytes into a 23070-byte allocation
at 0x705521c940
x0 0a0000705521c940 x1 0300006f75210ab0 x2 00000000000022a5
x3 0a0000705521ffc0
x4 0300006f75212de5 x5 0a000070552222f5 x6 0000000000005a1e
x7 0000000000000000
x8 339c000005a1e11e x9 00000000f041339c x10 000000009dd48904
x11 000000000000ffff
x12 0000000022b70889 x13 000000004cc0b2ff x14 0000000000000000
x15 0000000000000010
x16 00000071cc5d8fc0 x17 00000071cc54e040 x18 0000006ef7bd4000
x19 0300006f7520d430
x20 00000071cc5e0340 x21 0000000000005a1e x22 0a0000705521c940
x23 00000000000059b5
x24 00000000000000b1 x25 0300006f75212e4e x26 caa20000059b511d
x27 000000000000001d
x28 0300006f75212e30 x29 0000006f1fe385f0
lr 00000071cc54200c sp 0000006f1fe385c0 pc 00000071cc54e158
pst 0000000020001000
26 total frames
backtrace:
#00 pc 000000000006c158
/apex/com.android.runtime/lib64/bionic/libc.so
(__memcpy_aarch64_simd+280) (BuildId: 1e819f3e369d59be98bee38a8fbd0322)
#01 pc 0000000000060008
/apex/com.android.runtime/lib64/bionic/libc.so
(scudo::Allocator<scudo::AndroidNormalConfig,
&scudo_malloc_postinit>::reallocate(void*, unsigned long, unsigned
long)+696) (BuildId: 1e819f3e369d59be98bee38a8fbd0322)
#02 pc 000000000005fccc
/apex/com.android.runtime/lib64/bionic/libc.so (scudo_realloc+44)
(BuildId: 1e819f3e369d59be98bee38a8fbd0322)
#03 pc 000000000005c2cc
/apex/com.android.runtime/lib64/bionic/libc.so (LimitRealloc(void*,
unsigned long)+124) (BuildId: 1e819f3e369d59be98bee38a8fbd0322)
#04 pc 0000000000059a90
/apex/com.android.runtime/lib64/bionic/libc.so (realloc+160) (BuildId:
1e819f3e369d59be98bee38a8fbd0322)
#05 pc 0000000000011a74 /system/lib64/libutils.so
(android::SharedBuffer::editResize(unsigned long) const+68) (BuildId:
7aa2d71e030a290c8dd28236ba0a838f)
#06 pc 0000000000011ba8 /system/lib64/libutils.so
(android::String8::real_append(char const*, unsigned long)+88)
(BuildId: 7aa2d71e030a290c8dd28236ba0a838f)
#07 pc 000000000007b880
/system/lib64/libaudiopolicycomponents.so
(android::DeviceDescriptor::dump(android::String8*, int, bool)
const+208) (BuildId: 553fefffdca2f3a5dde634e123bd2c81)
#08 pc 000000000008094c
/system/lib64/libaudiopolicycomponents.so
(android::DeviceVector::dump(android::String8*, android::String8
const&, int, bool) const+636) (BuildId:
553fefffdca2f3a5dde634e123bd2c81)
#09 pc 0000000000092ed4
/system/lib64/libaudiopolicycomponents.so
(android::IOProfile::dump(android::String8*, int) const+980) (BuildId:
553fefffdca2f3a5dde634e123bd2c81)
#10 pc 000000000008bd7c
/system/lib64/libaudiopolicycomponents.so
(android::HwModule::dump(android::String8*, int) const+1148) (BuildId:
553fefffdca2f3a5dde634e123bd2c81)
#11 pc 000000000009044c
/system/lib64/libaudiopolicycomponents.so
(android::HwModuleCollection::dump(android::String8*) const+508)
(BuildId: 553fefffdca2f3a5dde634e123bd2c81)
#12 pc 0000000000090134
/system/lib64/libaudiopolicymanagerdefault.so
(android::AudioPolicyManager::dump(android::String8*) const+3908)
(BuildId: fdba879fc1a0c470759bfeb3d594ab81)
#13 pc 0000000000092e40
/system/lib64/libaudiopolicymanagerdefault.so
(android::AudioPolicyManager::dump(int)+80) (BuildId:
fdba879fc1a0c470759bfeb3d594ab81)
#14 pc 000000000022b218 /system/bin/audioserver
(android::AudioPolicyService::dump(int,
android::Vector<android::String16> const&)+392) (BuildId:
1988c27ce74b125f598a07a93367cfdd)
#15 pc 000000000022c8cc /system/bin/audioserver (non-virtual
thunk to android::AudioPolicyService::dump(int,
android::Vector<android::String16> const&)+12) (BuildId:
1988c27ce74b125f598a07a93367cfdd)
#16 pc 00000000000883f4 /system/lib64/libbinder.so
(android::BBinder::onTransact(unsigned int, android::Parcel const&,
android::Parcel*, unsigned int)+340) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
#17 pc 000000000003fadc /system/lib64/audiopolicy-aidl-cpp.so
(android::media::BnAudioPolicyService::onTransact(unsigned int,
android::Parcel const&, android::Parcel*, unsigned int)+19884)
(BuildId: ae185d80e4e54668275f262317dc2d7d)
#18 pc 000000000022adc4 /system/bin/audioserver
(android::AudioPolicyService::onTransact(unsigned int, android::Parcel
const&, android::Parcel*, unsigned int)+1076) (BuildId:
1988c27ce74b125f598a07a93367cfdd)
#19 pc 0000000000048adc /system/lib64/libbinder.so
(android::IPCThreadState::executeCommand(int)+748) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
#20 pc 0000000000051788 /system/lib64/libbinder.so
(android::IPCThreadState::joinThreadPool(bool)+296) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
#21 pc 000000000007e528 /system/lib64/libbinder.so
(android::PoolThread::threadLoop()+24) (BuildId:
4ace0dcb0135b71ba70b7aaee457d26f)
#22 pc 0000000000019268 /system/lib64/libutils.so
(android::Thread::_threadLoop(void*)+248) (BuildId:
7aa2d71e030a290c8dd28236ba0a838f)
#23 pc 000000000001b994 /system/lib64/libutils.so
(libutil_thread_trampoline(void*)
(.__uniq.226528677032898775202282855395389835431)+20) (BuildId:
7aa2d71e030a290c8dd28236ba0a838f)
#24 pc 0000000000083c8c
/apex/com.android.runtime/lib64/bionic/libc.so
(__pthread_start(void*)+236) (BuildId:
1e819f3e369d59be98bee38a8fbd0322)
#25 pc 00000000000761a0
/apex/com.android.runtime/lib64/bionic/libc.so (__start_thread+64)
(BuildId: 1e819f3e369d59be98bee38a8fbd0322)
Memory tags around the fault address (0xa00007055220000), one tag per
16 bytes:
0x705521f800: a a a a a a a a a a a a a a a a
0x705521f900: a a a a a a a a a a a a a a a a
0x705521fa00: a a a a a a a a a a a a a a a a
0x705521fb00: a a a a a a a a a a a a a a a a
0x705521fc00: a a a a a a a a a a a a a a a a
0x705521fd00: a a a a a a a a a a a a a a a a
0x705521fe00: a a a a a a a a a a a a a a a a
0x705521ff00: a a a a a a a a a a a a a a a a
=>0x7055220000:[0] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220100: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220200: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220300: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220400: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220500: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220600: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0x7055220700: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
---
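For reference, the tagged_addr_ctrl value 0x7fff3 above decodes to
PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC with a tag mask of 0xfffe, i.e.
synchronous tag-check faults. A minimal sketch of how a process opts into
that state (arm64-only; the fallback defines are only needed with older
libc headers):

#include <sys/prctl.h>

#ifndef PR_SET_TAGGED_ADDR_CTRL
#define PR_SET_TAGGED_ADDR_CTRL	55
#define PR_TAGGED_ADDR_ENABLE	(1UL << 0)
#endif
#ifndef PR_MTE_TCF_SYNC
#define PR_MTE_TCF_SYNC		(1UL << 1)
#define PR_MTE_TAG_SHIFT	3
#endif

/*
 * Enable tagged addresses and synchronous MTE tag-check faults, allowing
 * all tags except 0 (mask 0xfffe), matching the tombstone above.
 */
static int enable_sync_mte(void)
{
	unsigned long ctrl = PR_TAGGED_ADDR_ENABLE | PR_MTE_TCF_SYNC |
			     (0xfffeUL << PR_MTE_TAG_SHIFT);

	return prctl(PR_SET_TAGGED_ADDR_CTRL, ctrl, 0, 0, 0);
}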
Whenever memory pressure is high, this can happen to any process with
MTE enabled.
Any suggestions are appreciated.
Thanks,
Qun-wei
On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
> Hi,
>
> This patch has been in the mainline for some time, but we recently
> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
> are enabled.
>
> It seems that remapping to the same zeropage might causes MTE tag
> mismatches, since MTE tags are associated with physical addresses.
Does this only trigger when the VMA has MTE enabled? Maybe we'll have to
bail out if we detect that MTE is enabled.
Also, I wonder how KSM and the shared zeropage work in general with
that, because I would expect similar issues when we de-duplicate memory?
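On the first point, a minimal untested sketch of such a bail-out in
try_to_map_unused_to_zeropage() would be one extra vm_flags test; VM_MTE is
arm64's tagged-memory VMA flag and include/linux/mm.h defines it as VM_NONE
elsewhere, so the check compiles away on other architectures:

	/* untested sketch: also skip VMAs that may carry MTE tags */
	if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
	    (pvmw->vma->vm_flags & VM_MTE) ||
	    mm_forbids_zeropage(pvmw->vma->vm_mm))
		return false;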
--
Cheers
David / dhildenb
On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
> >
> > Hi,
> >
> > This patch has been in the mainline for some time, but we recently
> > discovered an issue when both mTHP and MTE (Memory Tagging Extension)
> > are enabled.
> >
> > It seems that remapping to the same zeropage might causes MTE tag
> > mismatches, since MTE tags are associated with physical addresses.
>
> Does this only trigger when the VMA has mte enabled? Maybe we'll have to
> bail out if we detect that mte is enabled.
It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
config RISCV_ISA_SUPM
        bool "Supm extension for userspace pointer masking"
        depends on 64BIT
        default y
        help
          Add support for pointer masking in userspace (Supm) when the
          underlying hardware extension (Smnpm or Ssnpm) is detected at boot.
          If this option is disabled, userspace will be unable to use
          the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
I wonder if we should disable the THP shrinker for architectures that
define PR_SET_TAGGED_ADDR_CTRL (or PR_GET_TAGGED_ADDR_CTRL).
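A rough sketch of that kind of coarse gate (arch_wants_zeropage_remap() is
a hypothetical helper, not an existing kernel symbol):

#ifndef arch_wants_zeropage_remap
/*
 * Hypothetical opt-out: architectures whose hardware keeps per-physical-page
 * metadata that the shared zeropage cannot represent (e.g. MTE tags) would
 * override this to return false.
 */
static inline bool arch_wants_zeropage_remap(void)
{
	return true;
}
#endif

/* __split_huge_page() from the patch above would then do something like: */
	remap_page(folio, nr, (arch_wants_zeropage_remap() && PageAnon(head)) ?
			      RMP_USE_SHARED_ZEROPAGE : 0);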
Cheers,
Lance
>
> Also, I wonder how KSM and the shared zeropage works in general with
> that, because I would expect similar issues when we de-duplicate memory?
>
> --
> Cheers
>
> David / dhildenb
>
>
On 18.09.25 14:22, Lance Yang wrote:
> On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>>>
>>> Hi,
>>>
>>> This patch has been in the mainline for some time, but we recently
>>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>>> are enabled.
>>>
>>> It seems that remapping to the same zeropage might causes MTE tag
>>> mismatches, since MTE tags are associated with physical addresses.
>>
>> Does this only trigger when the VMA has mte enabled? Maybe we'll have to
>> bail out if we detect that mte is enabled.
>
> It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
> the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
>
> config RISCV_ISA_SUPM
> bool "Supm extension for userspace pointer masking"
> depends on 64BIT
> default y
> help
> Add support for pointer masking in userspace (Supm) when the
> underlying hardware extension (Smnpm or Ssnpm) is detected at boot.
>
> If this option is disabled, userspace will be unable to use
> the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
>
> I wonder if we should disable the THP shrinker for such architectures that
I think where possible we really only want to identify problematic
(tagged) pages and skip them. And we should either look into fixing KSM
as well or find out why KSM is not affected.
--
Cheers
David / dhildenb
On 2025/9/18 20:35, David Hildenbrand wrote:
> On 18.09.25 14:22, Lance Yang wrote:
>> On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com>
>> wrote:
>>>
>>> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>>>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>>>> From: Yu Zhao <yuzhao@google.com>
>>>>> + remove_migration_ptes(src, dst, 0);
>>>>>
>>>>> out_unlock_both:
>>>>> folio_unlock(dst);
>>>>> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
>>>>> get_new_folio,
>>>>>
>>>>> if (page_was_mapped)
>>>>> remove_migration_ptes(src,
>>>>> - rc == MIGRATEPAGE_SUCCESS ? dst : src,
>>>>> false);
>>>>> + rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
>>>>>
>>>>> unlock_put_anon:
>>>>> folio_unlock(dst);
>>>>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>>>>> index 8d687de88a03..9cf26592ac93 100644
>>>>> --- a/mm/migrate_device.c
>>>>> +++ b/mm/migrate_device.c
>>>>> @@ -424,7 +424,7 @@ static unsigned long
>>>>> migrate_device_unmap(unsigned long *src_pfns,
>>>>> continue;
>>>>>
>>>>> folio = page_folio(page);
>>>>> - remove_migration_ptes(folio, folio, false);
>>>>> + remove_migration_ptes(folio, folio, 0);
>>>>>
>>>>> src_pfns[i] = 0;
>>>>> folio_unlock(folio);
>>>>> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
>>>>> *src_pfns,
>>>>> dst = src;
>>>>> }
>>>>>
>>>>> - remove_migration_ptes(src, dst, false);
>>>>> + remove_migration_ptes(src, dst, 0);
>>>>> folio_unlock(src);
>>>>>
>>>>> if (folio_is_zone_device(src))
>>>>
>>>> Hi,
>>>>
>>>> This patch has been in the mainline for some time, but we recently
>>>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>>>> are enabled.
>>>>
>>>> It seems that remapping to the same zeropage might cause MTE tag
>>>> mismatches, since MTE tags are associated with physical addresses.
>>>
>>> Does this only trigger when the VMA has mte enabled? Maybe we'll have to
>>> bail out if we detect that mte is enabled.
>>
>> It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
>> the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
>>
>> config RISCV_ISA_SUPM
>> bool "Supm extension for userspace pointer masking"
>> depends on 64BIT
>> default y
>> help
>> Add support for pointer masking in userspace (Supm) when the
>> underlying hardware extension (Smnpm or Ssnpm) is detected
>> at boot.
>>
>> If this option is disabled, userspace will be unable to use
>> the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
>>
>> I wonder if we should disable the THP shrinker for such architectures
>> that define PR_SET_TAGGED_ADDR_CTRL (or PR_GET_TAGGED_ADDR_CTRL).
>
> I think where possible we really only want to identify problematic
> (tagged) pages and skip them. And we should either look into fixing KSM
> as well or finding out why KSM is not affected.
Yeah. Seems like we could introduce a new helper,
folio_test_mte_tagged(struct folio *folio). By default, it would return
false, and architectures like arm64 can override it.

Looking at the code, the PG_mte_tagged flag is not set for regular THP.
The MTE status actually comes from the VM_MTE flag in the VMA that maps it.
static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
{
	bool ret = test_bit(PG_mte_tagged, &folio->flags.f);

	VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));

	/*
	 * If the folio is tagged, ensure ordering with a likely subsequent
	 * read of the tags.
	 */
	if (ret)
		smp_rmb();
	return ret;
}

static inline bool page_mte_tagged(struct page *page)
{
	bool ret = test_bit(PG_mte_tagged, &page->flags.f);

	VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));

	/*
	 * If the page is tagged, ensure ordering with a likely subsequent
	 * read of the tags.
	 */
	if (ret)
		smp_rmb();
	return ret;
}

contpte_set_ptes()
  __set_ptes()
    __set_ptes_anysz()
      __sync_cache_and_tags()
        mte_sync_tags()
          set_page_mte_tagged()
Then the THP shrinker could skip any folios that are identified as
MTE-tagged.
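
A rough sketch of what that could look like (the helper name and the
config guard here are assumptions for illustration, not existing kernel
API; page_mte_tagged() is the real arm64 helper pasted above):

/*
 * Sketch only: a per-(sub)page "is deduplication unsafe?" check that
 * defaults to false on architectures without memory tagging.
 */
#ifdef CONFIG_ARM64_MTE
# define thp_subpage_mte_tagged(page)	page_mte_tagged(page)
#else
# define thp_subpage_mte_tagged(page)	false
#endif

The shrinker and the split-to-zeropage path would then bail out for any
subpage where this returns true.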
Cheers,
Lance
>> I think where possible we really only want to identify problematic
>> (tagged) pages and skip them. And we should either look into fixing KSM
>> as well or finding out why KSM is not affected.
>
> Yeah. Seems like we could introduce a new helper,
> folio_test_mte_tagged(struct
> folio *folio). By default, it would return false, and architectures like
> arm64
> can override it.
If we add a new helper, it should instead express the semantics that we cannot deduplicate.
For THP, I recall that only some pages might be tagged. So likely we want to check per page.
>
> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
I think it's supported for THP per page. Only for hugetlb do we tag the whole thing through the head page instead of individual pages.
> The MTE
> status actually comes from the VM_MTE flag in the VMA that maps it.
>
During the rmap walk we could check the VMA flag, but there would be no way to stop the THP shrinker from scanning this page early.
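
For illustration, the rmap-walk-time check being referred to would look
roughly like this (VM_MTE falls back to VM_NONE on other architectures,
so the test is a no-op there; a sketch, not a proposal):

	/* in try_to_map_unused_to_zeropage(), before inspecting the data */
	if (pvmw->vma->vm_flags & VM_MTE)
		return false;

It would cover the split/migration path, but thp_underused() inspects the
folio contents before any VMA is known, so it could not bail out early
there.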
> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
> {
> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>
> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>
> /*
> * If the folio is tagged, ensure ordering with a likely subsequent
> * read of the tags.
> */
> if (ret)
> smp_rmb();
> return ret;
> }
>
> static inline bool page_mte_tagged(struct page *page)
> {
> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>
> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>
> /*
> * If the page is tagged, ensure ordering with a likely subsequent
> * read of the tags.
> */
> if (ret)
> smp_rmb();
> return ret;
> }
>
> contpte_set_ptes()
> __set_ptes()
> __set_ptes_anysz()
> __sync_cache_and_tags()
> mte_sync_tags()
> set_page_mte_tagged()
>
> Then, having the THP shrinker skip any folios that are identified as
> MTE-tagged.
Likely we should just do something like (maybe we want better naming)
#ifndef page_is_mergable
#define page_is_mergable(page) (true)
#endif
And for arm64 have it be
#define page_is_mergable(page) (!page_mte_tagged(page))
And then do
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1f0813b956436..1cac9093918d6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
for (i = 0; i < folio_nr_pages(folio); i++) {
kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
- if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
+ if (page_is_mergable(folio_page(folio, i)) &&
+ !memchr_inv(kaddr, 0, PAGE_SIZE)) {
num_zero_pages++;
if (num_zero_pages > khugepaged_max_ptes_none) {
kunmap_local(kaddr);
diff --git a/mm/migrate.c b/mm/migrate.c
index 946253c398072..476a9a9091bd3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
if (PageCompound(page))
return false;
+ if (!page_is_mergable(page))
+ return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
For KSM, similarly just bail out early. But I'm still wondering if this is
already checked somehow for KSM.
--
Cheers
David / dhildenb
On 2025/9/19 15:55, David Hildenbrand wrote:
>>> I think where possible we really only want to identify problematic
>>> (tagged) pages and skip them. And we should either look into fixing KSM
>>> as well or finding out why KSM is not affected.
>>
>> Yeah. Seems like we could introduce a new helper,
>> folio_test_mte_tagged(struct
>> folio *folio). By default, it would return false, and architectures like
>> arm64
>> can override it.
>
> If we add a new helper it should instead express the semantics that we
> cannot deduplicate.
Agreed.
>
> For THP, I recall that only some pages might be tagged. So likely we
> want to check per page.
Yes, a per-page check would be simpler.
>
>>
>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>
> I think it's supported for THP per page. Only for hugetlb we tag the
> whole thing through the head page instead of individual pages.
Right. That's exactly what I meant.
>
>> The MTE
>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>
>
> During the rmap walk we could check the VMA flag, but there would be no
> way to just stop the THP shrinker scanning this page early.
>
>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>> {
>> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>
>> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>
>> /*
>> * If the folio is tagged, ensure ordering with a likely subsequent
>> * read of the tags.
>> */
>> if (ret)
>> smp_rmb();
>> return ret;
>> }
>>
>> static inline bool page_mte_tagged(struct page *page)
>> {
>> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>
>> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>
>> /*
>> * If the page is tagged, ensure ordering with a likely subsequent
>> * read of the tags.
>> */
>> if (ret)
>> smp_rmb();
>> return ret;
>> }
>>
>> contpte_set_ptes()
>> __set_ptes()
>> __set_ptes_anysz()
>> __sync_cache_and_tags()
>> mte_sync_tags()
>> set_page_mte_tagged()
>>
>> Then, having the THP shrinker skip any folios that are identified as
>> MTE-tagged.
>
> Likely we should just do something like (maybe we want better naming)
>
> #ifndef page_is_mergable
> #define page_is_mergable(page) (true)
> #endif
Maybe something like page_is_optimizable()? Just a thought ;p
>
> And for arm64 have it be
>
> #define page_is_mergable(page) (!page_mte_tagged(page))
>
>
> And then do
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 1f0813b956436..1cac9093918d6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>
> for (i = 0; i < folio_nr_pages(folio); i++) {
> kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
> - if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
> + if (page_is_mergable(folio_page(folio, i)) &&
> + !memchr_inv(kaddr, 0, PAGE_SIZE)) {
> num_zero_pages++;
> if (num_zero_pages > khugepaged_max_ptes_none) {
> kunmap_local(kaddr);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 946253c398072..476a9a9091bd3 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
> page_vma_mapped_walk *pvmw,
>
> if (PageCompound(page))
> return false;
> + if (!page_is_mergable(page))
> + return false;
> VM_BUG_ON_PAGE(!PageAnon(page), page);
> VM_BUG_ON_PAGE(!PageLocked(page), page);
> VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
Looks good to me!
>
>
> For KSM, similarly just bail out early. But still wondering if this is
> already checked
> somehow for KSM.
+1 I'm looking for a machine to test it on.
On 2025/9/19 16:14, Lance Yang wrote:
>
>
> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>> I think where possible we really only want to identify problematic
>>>> (tagged) pages and skip them. And we should either look into fixing KSM
>>>> as well or finding out why KSM is not affected.
>>>
>>> Yeah. Seems like we could introduce a new helper,
>>> folio_test_mte_tagged(struct
>>> folio *folio). By default, it would return false, and architectures like
>>> arm64
>>> can override it.
>>
>> If we add a new helper it should instead express the semantics that we
>> cannot deduplicate.
>
> Agreed.
>
>>
>> For THP, I recall that only some pages might be tagged. So likely we
>> want to check per page.
>
> Yes, a per-page check would be simpler.
>
>>
>>>
>>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>>
>> I think it's supported for THP per page. Only for hugetlb we tag the
>> whole thing through the head page instead of individual pages.
>
> Right. That's exactly what I meant.
>
>>
>>> The MTE
>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>
>>
>> During the rmap walk we could check the VMA flag, but there would be
>> no way to just stop the THP shrinker scanning this page early.
>>
>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>> {
>>> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>
>>> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>
>>> /*
>>> * If the folio is tagged, ensure ordering with a likely subsequent
>>> * read of the tags.
>>> */
>>> if (ret)
>>> smp_rmb();
>>> return ret;
>>> }
>>>
>>> static inline bool page_mte_tagged(struct page *page)
>>> {
>>> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>
>>> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>
>>> /*
>>> * If the page is tagged, ensure ordering with a likely subsequent
>>> * read of the tags.
>>> */
>>> if (ret)
>>> smp_rmb();
>>> return ret;
>>> }
>>>
>>> contpte_set_ptes()
>>> __set_ptes()
>>> __set_ptes_anysz()
>>> __sync_cache_and_tags()
>>> mte_sync_tags()
>>> set_page_mte_tagged()
>>>
>>> Then, having the THP shrinker skip any folios that are identified as
>>> MTE-tagged.
>>
>> Likely we should just do something like (maybe we want better naming)
>>
>> #ifndef page_is_mergable
>> #define page_is_mergable(page) (true)
>> #endif
>
>
> Maybe something like page_is_optimizable()? Just a thought ;p
>
>>
>> And for arm64 have it be
>>
>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>
>>
>> And then do
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 1f0813b956436..1cac9093918d6 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>
>> for (i = 0; i < folio_nr_pages(folio); i++) {
>> kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>> - if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>> + if (page_is_mergable(folio_page(folio, i)) &&
>> + !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>> num_zero_pages++;
>> if (num_zero_pages > khugepaged_max_ptes_none) {
>> kunmap_local(kaddr);
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 946253c398072..476a9a9091bd3 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>> page_vma_mapped_walk *pvmw,
>>
>> if (PageCompound(page))
>> return false;
>> + if (!page_is_mergable(page))
>> + return false;
>> VM_BUG_ON_PAGE(!PageAnon(page), page);
>> VM_BUG_ON_PAGE(!PageLocked(page), page);
>> VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>
> Looks good to me!
>
>>
>>
>> For KSM, similarly just bail out early. But still wondering if this is
>> already checked
>> somehow for KSM.
>
> +1 I'm looking for a machine to test it on.
Interestingly, it seems KSM is already skipping MTE-tagged pages. My test,
running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no merging
activity for those pages ...
Hey David,
I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
>
>
> On 2025/9/19 16:14, Lance Yang wrote:
>>
>>
>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>> I think where possible we really only want to identify problematic
>>>>> (tagged) pages and skip them. And we should either look into fixing
>>>>> KSM
>>>>> as well or finding out why KSM is not affected.
>>>>
>>>> Yeah. Seems like we could introduce a new helper,
>>>> folio_test_mte_tagged(struct
>>>> folio *folio). By default, it would return false, and architectures
>>>> like
>>>> arm64
>>>> can override it.
>>>
>>> If we add a new helper it should instead express the semantics that
>>> we cannot deduplicate.
>>
>> Agreed.
>>
>>>
>>> For THP, I recall that only some pages might be tagged. So likely we
>>> want to check per page.
>>
>> Yes, a per-page check would be simpler.
>>
>>>
>>>>
>>>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>>>
>>> I think it's supported for THP per page. Only for hugetlb we tag the
>>> whole thing through the head page instead of individual pages.
>>
>> Right. That's exactly what I meant.
>>
>>>
>>>> The MTE
>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>
>>>
>>> During the rmap walk we could check the VMA flag, but there would be
>>> no way to just stop the THP shrinker scanning this page early.
>>>
>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>> {
>>>> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>
>>>> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>
>>>> /*
>>>> * If the folio is tagged, ensure ordering with a likely subsequent
>>>> * read of the tags.
>>>> */
>>>> if (ret)
>>>> smp_rmb();
>>>> return ret;
>>>> }
>>>>
>>>> static inline bool page_mte_tagged(struct page *page)
>>>> {
>>>> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>
>>>> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>
>>>> /*
>>>> * If the page is tagged, ensure ordering with a likely subsequent
>>>> * read of the tags.
>>>> */
>>>> if (ret)
>>>> smp_rmb();
>>>> return ret;
>>>> }
>>>>
>>>> contpte_set_ptes()
>>>> __set_ptes()
>>>> __set_ptes_anysz()
>>>> __sync_cache_and_tags()
>>>> mte_sync_tags()
>>>> set_page_mte_tagged()
>>>>
>>>> Then, having the THP shrinker skip any folios that are identified as
>>>> MTE-tagged.
>>>
>>> Likely we should just do something like (maybe we want better naming)
>>>
>>> #ifndef page_is_mergable
>>> #define page_is_mergable(page) (true)
>>> #endif
>>
>>
>> Maybe something like page_is_optimizable()? Just a thought ;p
>>
>>>
>>> And for arm64 have it be
>>>
>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>
>>>
>>> And then do
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 1f0813b956436..1cac9093918d6 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>
>>> for (i = 0; i < folio_nr_pages(folio); i++) {
>>> kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>> - if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>> + if (page_is_mergable(folio_page(folio, i)) &&
>>> + !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>> num_zero_pages++;
>>> if (num_zero_pages >
>>> khugepaged_max_ptes_none) {
>>> kunmap_local(kaddr);
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index 946253c398072..476a9a9091bd3 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>>> page_vma_mapped_walk *pvmw,
>>>
>>> if (PageCompound(page))
>>> return false;
>>> + if (!page_is_mergable(page))
>>> + return false;
>>> VM_BUG_ON_PAGE(!PageAnon(page), page);
>>> VM_BUG_ON_PAGE(!PageLocked(page), page);
>>> VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>
>> Looks good to me!
>>
>>>
>>>
>>> For KSM, similarly just bail out early. But still wondering if this
>>> is already checked
>>> somehow for KSM.
>>
>> +1 I'm looking for a machine to test it on.
>
> Interestingly, it seems KSM is already skipping MTE-tagged pages. My test,
> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no merging
> activity for those pages ...
KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c contains
a specific check that prevents merging in this case.
try_to_merge_one_page()
  -> pages_identical()
    -> !memcmp_pages()   Fails!
  -> replace_page()
int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = page_address(page1);
	addr2 = page_address(page2);
	ret = memcmp(addr1, addr2, PAGE_SIZE);

	if (!system_supports_mte() || ret)
		return ret;

	/*
	 * If the page content is identical but at least one of the pages is
	 * tagged, return non-zero to avoid KSM merging. If only one of the
	 * pages is tagged, __set_ptes() may zero or change the tags of the
	 * other page via mte_sync_tags().
	 */
	if (page_mte_tagged(page1) || page_mte_tagged(page2))
		return addr1 != addr2;

	return ret;
}
IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
a non-zero value, which in turn causes pages_identical() to return false.
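
For reference, pages_identical() itself is only a thin negating wrapper
around memcmp_pages() (paraphrased; the wrapper lives on the KSM side):

	static inline int pages_identical(struct page *page1, struct page *page2)
	{
		return !memcmp_pages(page1, page2);
	}

so any architecture override of memcmp_pages(), like the arm64 one above,
is picked up by every KSM comparison automatically.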
Cheers,
Lance
On 19.09.25 14:19, Lance Yang wrote:
> Hey David,
>
> I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
>
>>
>>
>> On 2025/9/19 16:14, Lance Yang wrote:
>>>
>>>
>>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>>> I think where possible we really only want to identify problematic
>>>>>> (tagged) pages and skip them. And we should either look into fixing
>>>>>> KSM
>>>>>> as well or finding out why KSM is not affected.
>>>>>
>>>>> Yeah. Seems like we could introduce a new helper,
>>>>> folio_test_mte_tagged(struct
>>>>> folio *folio). By default, it would return false, and architectures
>>>>> like
>>>>> arm64
>>>>> can override it.
>>>>
>>>> If we add a new helper it should instead express the semantics that
>>>> we cannot deduplicate.
>>>
>>> Agreed.
>>>
>>>>
>>>> For THP, I recall that only some pages might be tagged. So likely we
>>>> want to check per page.
>>>
>>> Yes, a per-page check would be simpler.
>>>
>>>>
>>>>>
>>>>> Looking at the code, the PG_mte_tagged flag is not set for regular THP.
>>>>
>>>> I think it's supported for THP per page. Only for hugetlb we tag the
>>>> whole thing through the head page instead of individual pages.
>>>
>>> Right. That's exactly what I meant.
>>>
>>>>
>>>>> The MTE
>>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>>
>>>>
>>>> During the rmap walk we could check the VMA flag, but there would be
>>>> no way to just stop the THP shrinker scanning this page early.
>>>>
>>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>>> {
>>>>> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>>
>>>>> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>>
>>>>> /*
>>>>> * If the folio is tagged, ensure ordering with a likely subsequent
>>>>> * read of the tags.
>>>>> */
>>>>> if (ret)
>>>>> smp_rmb();
>>>>> return ret;
>>>>> }
>>>>>
>>>>> static inline bool page_mte_tagged(struct page *page)
>>>>> {
>>>>> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>>
>>>>> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>>
>>>>> /*
>>>>> * If the page is tagged, ensure ordering with a likely subsequent
>>>>> * read of the tags.
>>>>> */
>>>>> if (ret)
>>>>> smp_rmb();
>>>>> return ret;
>>>>> }
>>>>>
>>>>> contpte_set_ptes()
>>>>> __set_ptes()
>>>>> __set_ptes_anysz()
>>>>> __sync_cache_and_tags()
>>>>> mte_sync_tags()
>>>>> set_page_mte_tagged()
>>>>>
>>>>> Then, having the THP shrinker skip any folios that are identified as
>>>>> MTE-tagged.
>>>>
>>>> Likely we should just do something like (maybe we want better naming)
>>>>
>>>> #ifndef page_is_mergable
>>>> #define page_is_mergable(page) (true)
>>>> #endif
>>>
>>>
>>> Maybe something like page_is_optimizable()? Just a thought ;p
>>>
>>>>
>>>> And for arm64 have it be
>>>>
>>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>>
>>>>
>>>> And then do
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 1f0813b956436..1cac9093918d6 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>>
>>>> for (i = 0; i < folio_nr_pages(folio); i++) {
>>>> kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>>> - if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>> + if (page_is_mergable(folio_page(folio, i)) &&
>>>> + !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>> num_zero_pages++;
>>>> if (num_zero_pages >
>>>> khugepaged_max_ptes_none) {
>>>> kunmap_local(kaddr);
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 946253c398072..476a9a9091bd3 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>>>> page_vma_mapped_walk *pvmw,
>>>>
>>>> if (PageCompound(page))
>>>> return false;
>>>> + if (!page_is_mergable(page))
>>>> + return false;
>>>> VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>> VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>> VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>>
>>> Looks good to me!
>>>
>>>>
>>>>
>>>> For KSM, similarly just bail out early. But still wondering if this
>>>> is already checked
>>>> somehow for KSM.
>>>
>>> +1 I'm looking for a machine to test it on.
>>
>> Interestingly, it seems KSM is already skipping MTE-tagged pages. My test,
>> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no merging
>> activity for those pages ...
>
> KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
> arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c contains
> a specific check that prevents merging in this case.
>
> try_to_merge_one_page()
> -> pages_identical()
> -> !memcmp_pages() Fails!
> -> replace_page()
>
>
> int memcmp_pages(struct page *page1, struct page *page2)
> {
> char *addr1, *addr2;
> int ret;
>
> addr1 = page_address(page1);
> addr2 = page_address(page2);
> ret = memcmp(addr1, addr2, PAGE_SIZE);
>
> if (!system_supports_mte() || ret)
> return ret;
>
> /*
> * If the page content is identical but at least one of the pages is
> * tagged, return non-zero to avoid KSM merging. If only one of the
> * pages is tagged, __set_ptes() may zero or change the tags of the
> * other page via mte_sync_tags().
> */
> if (page_mte_tagged(page1) || page_mte_tagged(page2))
> return addr1 != addr2;
>
> return ret;
> }
>
> IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
> a non-zero value, which in turn causes pages_identical() to return false.
Cool, so we should likely just use that then in the shrinker code. Can
you send a fix?
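
A minimal sketch of that direction (assuming memcmp_pages(), the same hook
KSM ends up in, is usable from the shrinker/split code; a sketch, not the
actual fix):

	/* in try_to_map_unused_to_zeropage(), instead of the raw memchr_inv(): */
	if (memcmp_pages(page, ZERO_PAGE(0)))
		return false;	/* non-zero data, or the architecture (MTE) vetoes */

	/* and likewise in thp_underused() when counting zero-filled subpages */

That way a zero-filled but MTE-tagged subpage is never folded onto the
shared zeropage, for the same reason KSM refuses to merge it.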
--
Cheers
David / dhildenb
On 2025/9/19 21:09, David Hildenbrand wrote:
> On 19.09.25 14:19, Lance Yang wrote:
>> Hey David,
>>
>> I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
>>
>>>
>>>
>>> On 2025/9/19 16:14, Lance Yang wrote:
>>>>
>>>>
>>>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>>>> I think where possible we really only want to identify problematic
>>>>>>> (tagged) pages and skip them. And we should either look into fixing
>>>>>>> KSM
>>>>>>> as well or finding out why KSM is not affected.
>>>>>>
>>>>>> Yeah. Seems like we could introduce a new helper,
>>>>>> folio_test_mte_tagged(struct
>>>>>> folio *folio). By default, it would return false, and architectures
>>>>>> like
>>>>>> arm64
>>>>>> can override it.
>>>>>
>>>>> If we add a new helper it should instead express the semantics that
>>>>> we cannot deduplicate.
>>>>
>>>> Agreed.
>>>>
>>>>>
>>>>> For THP, I recall that only some pages might be tagged. So likely we
>>>>> want to check per page.
>>>>
>>>> Yes, a per-page check would be simpler.
>>>>
>>>>>
>>>>>>
>>>>>> Looking at the code, the PG_mte_tagged flag is not set for regular
>>>>>> THP.
>>>>>
>>>>> I think it's supported for THP per page. Only for hugetlb we tag the
>>>>> whole thing through the head page instead of individual pages.
>>>>
>>>> Right. That's exactly what I meant.
>>>>
>>>>>
>>>>>> The MTE
>>>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>>>
>>>>>
>>>>> During the rmap walk we could check the VMA flag, but there would be
>>>>> no way to just stop the THP shrinker scanning this page early.
>>>>>
>>>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>>>> {
>>>>>> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>>>
>>>>>> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>>>
>>>>>> /*
>>>>>> * If the folio is tagged, ensure ordering with a likely
>>>>>> subsequent
>>>>>> * read of the tags.
>>>>>> */
>>>>>> if (ret)
>>>>>> smp_rmb();
>>>>>> return ret;
>>>>>> }
>>>>>>
>>>>>> static inline bool page_mte_tagged(struct page *page)
>>>>>> {
>>>>>> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>>>
>>>>>> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>>>
>>>>>> /*
>>>>>> * If the page is tagged, ensure ordering with a likely
>>>>>> subsequent
>>>>>> * read of the tags.
>>>>>> */
>>>>>> if (ret)
>>>>>> smp_rmb();
>>>>>> return ret;
>>>>>> }
>>>>>>
>>>>>> contpte_set_ptes()
>>>>>> __set_ptes()
>>>>>> __set_ptes_anysz()
>>>>>> __sync_cache_and_tags()
>>>>>> mte_sync_tags()
>>>>>> set_page_mte_tagged()
>>>>>>
>>>>>> Then, having the THP shrinker skip any folios that are identified as
>>>>>> MTE-tagged.
>>>>>
>>>>> Likely we should just do something like (maybe we want better naming)
>>>>>
>>>>> #ifndef page_is_mergable
>>>>> #define page_is_mergable(page) (true)
>>>>> #endif
>>>>
>>>>
>>>> Maybe something like page_is_optimizable()? Just a thought ;p
>>>>
>>>>>
>>>>> And for arm64 have it be
>>>>>
>>>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>>>
>>>>>
>>>>> And then do
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 1f0813b956436..1cac9093918d6 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>>>
>>>>> for (i = 0; i < folio_nr_pages(folio); i++) {
>>>>> kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>>>> - if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>>> + if (page_is_mergable(folio_page(folio, i)) &&
>>>>> + !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>>> num_zero_pages++;
>>>>> if (num_zero_pages >
>>>>> khugepaged_max_ptes_none) {
>>>>> kunmap_local(kaddr);
>>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>>> index 946253c398072..476a9a9091bd3 100644
>>>>> --- a/mm/migrate.c
>>>>> +++ b/mm/migrate.c
>>>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>>>>> page_vma_mapped_walk *pvmw,
>>>>>
>>>>> if (PageCompound(page))
>>>>> return false;
>>>>> + if (!page_is_mergable(page))
>>>>> + return false;
>>>>> VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>>> VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>>> VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>>>
>>>> Looks good to me!
>>>>
>>>>>
>>>>>
>>>>> For KSM, similarly just bail out early. But still wondering if this
>>>>> is already checked
>>>>> somehow for KSM.
>>>>
>>>> +1 I'm looking for a machine to test it on.
>>>
>>> Interestingly, it seems KSM is already skipping MTE-tagged pages. My
>>> test,
>>> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no
>>> merging
>>> activity for those pages ...
>>
>> KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
>> arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c
>> contains
>> a specific check that prevents merging in this case.
>>
>> try_to_merge_one_page()
>> -> pages_identical()
>> -> !memcmp_pages() Fails!
>> -> replace_page()
>>
>>
>> int memcmp_pages(struct page *page1, struct page *page2)
>> {
>> char *addr1, *addr2;
>> int ret;
>>
>> addr1 = page_address(page1);
>> addr2 = page_address(page2);
>> ret = memcmp(addr1, addr2, PAGE_SIZE);
>>
>> if (!system_supports_mte() || ret)
>> return ret;
>>
>> /*
>> * If the page content is identical but at least one of the pages is
>> * tagged, return non-zero to avoid KSM merging. If only one of the
>> * pages is tagged, __set_ptes() may zero or change the tags of the
>> * other page via mte_sync_tags().
>> */
>> if (page_mte_tagged(page1) || page_mte_tagged(page2))
>> return addr1 != addr2;
>>
>> return ret;
>> }
>>
>> IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
>> a non-zero value, which in turn causes pages_identical() to return false.
>
> Cool, so we should likely just use that then in the shrinker code. Can
> you send a fix?
Certainly! I'll get on that ;p
Cheers,
Lance
On 2025/9/19 20:19, Lance Yang wrote:
> Hey David,
>
> I believe I've found the exact reason why KSM skips MTE-tagged pages ;p
>
>>
>>
>> On 2025/9/19 16:14, Lance Yang wrote:
>>>
>>>
>>> On 2025/9/19 15:55, David Hildenbrand wrote:
>>>>>> I think where possible we really only want to identify problematic
>>>>>> (tagged) pages and skip them. And we should either look into
>>>>>> fixing KSM
>>>>>> as well or finding out why KSM is not affected.
>>>>>
>>>>> Yeah. Seems like we could introduce a new helper,
>>>>> folio_test_mte_tagged(struct
>>>>> folio *folio). By default, it would return false, and architectures
>>>>> like
>>>>> arm64
>>>>> can override it.
>>>>
>>>> If we add a new helper it should instead express the semantics that
>>>> we cannot deduplicate.
>>>
>>> Agreed.
>>>
>>>>
>>>> For THP, I recall that only some pages might be tagged. So likely we
>>>> want to check per page.
>>>
>>> Yes, a per-page check would be simpler.
>>>
>>>>
>>>>>
>>>>> Looking at the code, the PG_mte_tagged flag is not set for regular
>>>>> THP.
>>>>
>>>> I think it's supported for THP per page. Only for hugetlb we tag the
>>>> whole thing through the head page instead of individual pages.
>>>
>>> Right. That's exactly what I meant.
>>>
>>>>
>>>>> The MTE
>>>>> status actually comes from the VM_MTE flag in the VMA that maps it.
>>>>>
>>>>
>>>> During the rmap walk we could check the VMA flag, but there would be
>>>> no way to just stop the THP shrinker scanning this page early.
>>>>
>>>>> static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
>>>>> {
>>>>> bool ret = test_bit(PG_mte_tagged, &folio->flags.f);
>>>>>
>>>>> VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));
>>>>>
>>>>> /*
>>>>> * If the folio is tagged, ensure ordering with a likely
>>>>> subsequent
>>>>> * read of the tags.
>>>>> */
>>>>> if (ret)
>>>>> smp_rmb();
>>>>> return ret;
>>>>> }
>>>>>
>>>>> static inline bool page_mte_tagged(struct page *page)
>>>>> {
>>>>> bool ret = test_bit(PG_mte_tagged, &page->flags.f);
>>>>>
>>>>> VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));
>>>>>
>>>>> /*
>>>>> * If the page is tagged, ensure ordering with a likely subsequent
>>>>> * read of the tags.
>>>>> */
>>>>> if (ret)
>>>>> smp_rmb();
>>>>> return ret;
>>>>> }
>>>>>
>>>>> contpte_set_ptes()
>>>>> __set_ptes()
>>>>> __set_ptes_anysz()
>>>>> __sync_cache_and_tags()
>>>>> mte_sync_tags()
>>>>> set_page_mte_tagged()
>>>>>
>>>>> Then, having the THP shrinker skip any folios that are identified as
>>>>> MTE-tagged.
>>>>
>>>> Likely we should just do something like (maybe we want better naming)
>>>>
>>>> #ifndef page_is_mergable
>>>> #define page_is_mergable(page) (true)
>>>> #endif
>>>
>>>
>>> Maybe something like page_is_optimizable()? Just a thought ;p
>>>
>>>>
>>>> And for arm64 have it be
>>>>
>>>> #define page_is_mergable(page) (!page_mte_tagged(page))
>>>>
>>>>
>>>> And then do
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 1f0813b956436..1cac9093918d6 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -4251,7 +4251,8 @@ static bool thp_underused(struct folio *folio)
>>>>
>>>> for (i = 0; i < folio_nr_pages(folio); i++) {
>>>> kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>>>> - if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>> + if (page_is_mergable(folio_page(folio, i)) &&
>>>> + !memchr_inv(kaddr, 0, PAGE_SIZE)) {
>>>> num_zero_pages++;
>>>> if (num_zero_pages >
>>>> khugepaged_max_ptes_none) {
>>>> kunmap_local(kaddr);
>>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>>> index 946253c398072..476a9a9091bd3 100644
>>>> --- a/mm/migrate.c
>>>> +++ b/mm/migrate.c
>>>> @@ -306,6 +306,8 @@ static bool try_to_map_unused_to_zeropage(struct
>>>> page_vma_mapped_walk *pvmw,
>>>>
>>>> if (PageCompound(page))
>>>> return false;
>>>> + if (!page_is_mergable(page))
>>>> + return false;
>>>> VM_BUG_ON_PAGE(!PageAnon(page), page);
>>>> VM_BUG_ON_PAGE(!PageLocked(page), page);
>>>> VM_BUG_ON_PAGE(pte_present(ptep_get(pvmw->pte)), page);
>>>
>>> Looks good to me!
>>>
>>>>
>>>>
>>>> For KSM, similarly just bail out early. But still wondering if this
>>>> is already checked
>>>> somehow for KSM.
>>>
>>> +1 I'm looking for a machine to test it on.
>>
>> Interestingly, it seems KSM is already skipping MTE-tagged pages. My
>> test,
>> running on a v6.8.0 kernel inside QEMU (with MTE enabled), shows no
>> merging
>> activity for those pages ...
>
> KSM's call to pages_identical() ultimately leads to memcmp_pages(). The
> arm64 implementation of memcmp_pages() in arch/arm64/kernel/mte.c contains
> a specific check that prevents merging in this case.
>
> try_to_merge_one_page()
> -> pages_identical()
> -> !memcmp_pages() Fails!
> -> replace_page()
Forgot to add:
memcmp_pages() is also called in other KSM paths, such as
stable_tree_search(), stable_tree_insert(), and
unstable_tree_search_insert(), effectively blocking MTE-tagged
pages from entering either of KSM's trees.
>
>
> int memcmp_pages(struct page *page1, struct page *page2)
> {
> char *addr1, *addr2;
> int ret;
>
> addr1 = page_address(page1);
> addr2 = page_address(page2);
> ret = memcmp(addr1, addr2, PAGE_SIZE);
>
> if (!system_supports_mte() || ret)
> return ret;
>
> /*
> * If the page content is identical but at least one of the pages is
> * tagged, return non-zero to avoid KSM merging. If only one of the
> * pages is tagged, __set_ptes() may zero or change the tags of the
> * other page via mte_sync_tags().
> */
> if (page_mte_tagged(page1) || page_mte_tagged(page2))
> return addr1 != addr2;
>
> return ret;
> }
>
> IIUC, if either page is MTE-tagged, memcmp_pages() intentionally returns
> a non-zero value, which in turn causes pages_identical() to return false.
>
> Cheers,
> Lance
On Thu, Sep 18, 2025 at 8:22 PM Lance Yang <lance.yang@linux.dev> wrote:
>
> On Thu, Sep 18, 2025 at 5:21 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
> > > On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
> > >> From: Yu Zhao <yuzhao@google.com>
> > >>
> > >> Here being unused means containing only zeros and inaccessible to
> > >> userspace. When splitting an isolated thp under reclaim or migration,
> > >> the unused subpages can be mapped to the shared zeropage, hence
> > >> saving
> > >> memory. This is particularly helpful when the internal
> > >> fragmentation of a thp is high, i.e. it has many untouched subpages.
> > >>
> > >> This is also a prerequisite for THP low utilization shrinker which
> > >> will
> > >> be introduced in later patches, where underutilized THPs are split,
> > >> and
> > >> the zero-filled pages are freed saving memory.
> > >>
> > >> Signed-off-by: Yu Zhao <yuzhao@google.com>
> > >> Tested-by: Shuang Zhai <zhais@google.com>
> > >> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
> > >> ---
> > >> include/linux/rmap.h | 7 ++++-
> > >> mm/huge_memory.c | 8 ++---
> > >> mm/migrate.c | 72 ++++++++++++++++++++++++++++++++++++++------
> > >> mm/migrate_device.c | 4 +--
> > >> 4 files changed, 75 insertions(+), 16 deletions(-)
> > >>
> > >> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> > >> index 91b5935e8485..d5e93e44322e 100644
> > >> --- a/include/linux/rmap.h
> > >> +++ b/include/linux/rmap.h
> > >> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
> > >> int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
> > >> pgoff_t pgoff,
> > >> struct vm_area_struct *vma);
> > >>
> > >> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> > >> bool locked);
> > >> +enum rmp_flags {
> > >> + RMP_LOCKED = 1 << 0,
> > >> + RMP_USE_SHARED_ZEROPAGE = 1 << 1,
> > >> +};
> > >> +
> > >> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> > >> flags);
> > >>
> > >> /*
> > >> * rmap_walk_control: To control rmap traversing for specific needs
> > >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > >> index 0c48806ccb9a..af60684e7c70 100644
> > >> --- a/mm/huge_memory.c
> > >> +++ b/mm/huge_memory.c
> > >> @@ -3020,7 +3020,7 @@ bool unmap_huge_pmd_locked(struct
> > >> vm_area_struct *vma, unsigned long addr,
> > >> return false;
> > >> }
> > >>
> > >> -static void remap_page(struct folio *folio, unsigned long nr)
> > >> +static void remap_page(struct folio *folio, unsigned long nr, int
> > >> flags)
> > >> {
> > >> int i = 0;
> > >>
> > >> @@ -3028,7 +3028,7 @@ static void remap_page(struct folio *folio,
> > >> unsigned long nr)
> > >> if (!folio_test_anon(folio))
> > >> return;
> > >> for (;;) {
> > >> - remove_migration_ptes(folio, folio, true);
> > >> + remove_migration_ptes(folio, folio, RMP_LOCKED |
> > >> flags);
> > >> i += folio_nr_pages(folio);
> > >> if (i >= nr)
> > >> break;
> > >> @@ -3240,7 +3240,7 @@ static void __split_huge_page(struct page
> > >> *page, struct list_head *list,
> > >>
> > >> if (nr_dropped)
> > >> shmem_uncharge(folio->mapping->host, nr_dropped);
> > >> - remap_page(folio, nr);
> > >> + remap_page(folio, nr, PageAnon(head) ?
> > >> RMP_USE_SHARED_ZEROPAGE : 0);
> > >>
> > >> /*
> > >> * set page to its compound_head when split to non order-0
> > >> pages, so
> > >> @@ -3542,7 +3542,7 @@ int split_huge_page_to_list_to_order(struct
> > >> page *page, struct list_head *list,
> > >> if (mapping)
> > >> xas_unlock(&xas);
> > >> local_irq_enable();
> > >> - remap_page(folio, folio_nr_pages(folio));
> > >> + remap_page(folio, folio_nr_pages(folio), 0);
> > >> ret = -EAGAIN;
> > >> }
> > >>
> > >> diff --git a/mm/migrate.c b/mm/migrate.c
> > >> index 6f9c62c746be..d039863e014b 100644
> > >> --- a/mm/migrate.c
> > >> +++ b/mm/migrate.c
> > >> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio,
> > >> struct list_head *list)
> > >> return true;
> > >> }
> > >>
> > >> +static bool try_to_map_unused_to_zeropage(struct
> > >> page_vma_mapped_walk *pvmw,
> > >> + struct folio *folio,
> > >> + unsigned long idx)
> > >> +{
> > >> + struct page *page = folio_page(folio, idx);
> > >> + bool contains_data;
> > >> + pte_t newpte;
> > >> + void *addr;
> > >> +
> > >> + VM_BUG_ON_PAGE(PageCompound(page), page);
> > >> + VM_BUG_ON_PAGE(!PageAnon(page), page);
> > >> + VM_BUG_ON_PAGE(!PageLocked(page), page);
> > >> + VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> > >> +
> > >> + if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags &
> > >> VM_LOCKED) ||
> > >> + mm_forbids_zeropage(pvmw->vma->vm_mm))
> > >> + return false;
> > >> +
> > >> + /*
> > >> + * The pmd entry mapping the old thp was flushed and the pte
> > >> mapping
> > >> + * this subpage has been non present. If the subpage is only
> > >> zero-filled
> > >> + * then map it to the shared zeropage.
> > >> + */
> > >> + addr = kmap_local_page(page);
> > >> + contains_data = memchr_inv(addr, 0, PAGE_SIZE);
> > >> + kunmap_local(addr);
> > >> +
> > >> + if (contains_data)
> > >> + return false;
> > >> +
> > >> + newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
> > >> + pvmw->vma->vm_page_prot));
> > >> + set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte,
> > >> newpte);
> > >> +
> > >> + dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
> > >> + return true;
> > >> +}
> > >> +
> > >> +struct rmap_walk_arg {
> > >> + struct folio *folio;
> > >> + bool map_unused_to_zeropage;
> > >> +};
> > >> +
> > >> /*
> > >> * Restore a potential migration pte to a working pte entry
> > >> */
> > >> static bool remove_migration_pte(struct folio *folio,
> > >> - struct vm_area_struct *vma, unsigned long addr, void
> > >> *old)
> > >> + struct vm_area_struct *vma, unsigned long addr, void
> > >> *arg)
> > >> {
> > >> - DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC |
> > >> PVMW_MIGRATION);
> > >> + struct rmap_walk_arg *rmap_walk_arg = arg;
> > >> + DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr,
> > >> PVMW_SYNC | PVMW_MIGRATION);
> > >>
> > >> while (page_vma_mapped_walk(&pvmw)) {
> > >> rmap_t rmap_flags = RMAP_NONE;
> > >> @@ -234,6 +278,9 @@ static bool remove_migration_pte(struct folio
> > >> *folio,
> > >> continue;
> > >> }
> > >> #endif
> > >> + if (rmap_walk_arg->map_unused_to_zeropage &&
> > >> + try_to_map_unused_to_zeropage(&pvmw, folio,
> > >> idx))
> > >> + continue;
> > >>
> > >> folio_get(folio);
> > >> pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
> > >> @@ -312,14 +359,21 @@ static bool remove_migration_pte(struct folio
> > >> *folio,
> > >> * Get rid of all migration entries and replace them by
> > >> * references to the indicated page.
> > >> */
> > >> -void remove_migration_ptes(struct folio *src, struct folio *dst,
> > >> bool locked)
> > >> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
> > >> flags)
> > >> {
> > >> + struct rmap_walk_arg rmap_walk_arg = {
> > >> + .folio = src,
> > >> + .map_unused_to_zeropage = flags &
> > >> RMP_USE_SHARED_ZEROPAGE,
> > >> + };
> > >> +
> > >> struct rmap_walk_control rwc = {
> > >> .rmap_one = remove_migration_pte,
> > >> - .arg = src,
> > >> + .arg = &rmap_walk_arg,
> > >> };
> > >>
> > >> - if (locked)
> > >> + VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src !=
> > >> dst), src);
> > >> +
> > >> + if (flags & RMP_LOCKED)
> > >> rmap_walk_locked(dst, &rwc);
> > >> else
> > >> rmap_walk(dst, &rwc);
> > >> @@ -934,7 +988,7 @@ static int writeout(struct address_space
> > >> *mapping, struct folio *folio)
> > >> * At this point we know that the migration attempt cannot
> > >> * be successful.
> > >> */
> > >> - remove_migration_ptes(folio, folio, false);
> > >> + remove_migration_ptes(folio, folio, 0);
> > >>
> > >> rc = mapping->a_ops->writepage(&folio->page, &wbc);
> > >>
> > >> @@ -1098,7 +1152,7 @@ static void migrate_folio_undo_src(struct folio
> > >> *src,
> > >> struct list_head *ret)
> > >> {
> > >> if (page_was_mapped)
> > >> - remove_migration_ptes(src, src, false);
> > >> + remove_migration_ptes(src, src, 0);
> > >> /* Drop an anon_vma reference if we took one */
> > >> if (anon_vma)
> > >> put_anon_vma(anon_vma);
> > >> @@ -1336,7 +1390,7 @@ static int migrate_folio_move(free_folio_t
> > >> put_new_folio, unsigned long private,
> > >> lru_add_drain();
> > >>
> > >> if (old_page_state & PAGE_WAS_MAPPED)
> > >> - remove_migration_ptes(src, dst, false);
> > >> + remove_migration_ptes(src, dst, 0);
> > >>
> > >> out_unlock_both:
> > >> folio_unlock(dst);
> > >> @@ -1474,7 +1528,7 @@ static int unmap_and_move_huge_page(new_folio_t
> > >> get_new_folio,
> > >>
> > >> if (page_was_mapped)
> > >> remove_migration_ptes(src,
> > >> - rc == MIGRATEPAGE_SUCCESS ? dst : src,
> > >> false);
> > >> + rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
> > >>
> > >> unlock_put_anon:
> > >> folio_unlock(dst);
> > >> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> > >> index 8d687de88a03..9cf26592ac93 100644
> > >> --- a/mm/migrate_device.c
> > >> +++ b/mm/migrate_device.c
> > >> @@ -424,7 +424,7 @@ static unsigned long
> > >> migrate_device_unmap(unsigned long *src_pfns,
> > >> continue;
> > >>
> > >> folio = page_folio(page);
> > >> - remove_migration_ptes(folio, folio, false);
> > >> + remove_migration_ptes(folio, folio, 0);
> > >>
> > >> src_pfns[i] = 0;
> > >> folio_unlock(folio);
> > >> @@ -840,7 +840,7 @@ void migrate_device_finalize(unsigned long
> > >> *src_pfns,
> > >> dst = src;
> > >> }
> > >>
> > >> - remove_migration_ptes(src, dst, false);
> > >> + remove_migration_ptes(src, dst, 0);
> > >> folio_unlock(src);
> > >>
> > >> if (folio_is_zone_device(src))
> > >
> > > Hi,
> > >
> > > This patch has been in the mainline for some time, but we recently
> > > discovered an issue when both mTHP and MTE (Memory Tagging Extension)
> > > are enabled.
> > >
> > > It seems that remapping to the same zeropage might cause MTE tag
> > > mismatches, since MTE tags are associated with physical addresses.
> >
> > Does this only trigger when the VMA has mte enabled? Maybe we'll have to
> > bail out if we detect that mte is enabled.
>
> It seems RISC-V also has a similar feature (RISCV_ISA_SUPM) that uses
> the same prctl(PR_{GET,SET}_TAGGED_ADDR_CTRL) API.
>
> config RISCV_ISA_SUPM
> bool "Supm extension for userspace pointer masking"
> depends on 64BIT
> default y
> help
> Add support for pointer masking in userspace (Supm) when the
> underlying hardware extension (Smnpm or Ssnpm) is detected at boot.
>
> If this option is disabled, userspace will be unable to use
> the prctl(PR_{SET,GET}_TAGGED_ADDR_CTRL) API.
>
> I wonder if we should disable the THP shrinker for such architectures that
> define PR_SET_TAGGED_ADDR_CTRL (or PR_GET_TAGGED_ADDR_CTRL).
SET_TAGGED_ADDR_CTRL or GET_TAGGED_ADDR_CTRL, that is (from kernel/sys.c, around line 114):
#ifndef SET_TAGGED_ADDR_CTRL
# define SET_TAGGED_ADDR_CTRL(a) (-EINVAL)
#endif
#ifndef GET_TAGGED_ADDR_CTRL
# define GET_TAGGED_ADDR_CTRL() (-EINVAL)
#endif
Cheers,
Lance
>
> Cheers,
> Lance
>
> >
> > Also, I wonder how KSM and the shared zeropage works in general with
> > that, because I would expect similar issues when we de-duplicate memory?
> >
> > --
> > Cheers
> >
> > David / dhildenb
> >
> >
On 18/09/2025 09:56, David Hildenbrand wrote:
> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>> From: Yu Zhao <yuzhao@google.com>
>>>
>>> Here being unused means containing only zeros and inaccessible to
>>> userspace. When splitting an isolated thp under reclaim or migration,
>>> the unused subpages can be mapped to the shared zeropage, hence
>>> saving
>>> memory. This is particularly helpful when the internal
>>> fragmentation of a thp is high, i.e. it has many untouched subpages.
>>>
>>> This is also a prerequisite for THP low utilization shrinker which
>>> will
>>> be introduced in later patches, where underutilized THPs are split,
>>> and
>>> the zero-filled pages are freed saving memory.
>>>
>>> Signed-off-by: Yu Zhao <yuzhao@google.com>
>>> Tested-by: Shuang Zhai <zhais@google.com>
>>> Signed-off-by: Usama Arif <usamaarif642@gmail.com>
>>> ---
>>> include/linux/rmap.h | 7 ++++-
>>> mm/huge_memory.c | 8 ++---
>>> mm/migrate.c | 72 ++++++++++++++++++++++++++++++++++++++------
>>> mm/migrate_device.c | 4 +--
>>> 4 files changed, 75 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>> index 91b5935e8485..d5e93e44322e 100644
>>> --- a/include/linux/rmap.h
>>> +++ b/include/linux/rmap.h
>>> @@ -745,7 +745,12 @@ int folio_mkclean(struct folio *);
>>> int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages,
>>> pgoff_t pgoff,
>>> struct vm_area_struct *vma);
>>> -void remove_migration_ptes(struct folio *src, struct folio *dst,
>>> bool locked);
>>> +enum rmp_flags {
>>> + RMP_LOCKED = 1 << 0,
>>> + RMP_USE_SHARED_ZEROPAGE = 1 << 1,
>>> +};
>>> +
>>> +void remove_migration_ptes(struct folio *src, struct folio *dst, int
>>> flags);
>>> /*
>>> * rmap_walk_control: To control rmap traversing for specific needs
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> [...]
>>
>> Hi,
>>
>> This patch has been in the mainline for some time, but we recently
>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>> are enabled.
>>
>> It seems that remapping to the same zeropage might cause MTE tag
>> mismatches, since MTE tags are associated with physical addresses.
>
> Does this only trigger when the VMA has mte enabled? Maybe we'll have to bail out if we detect that mte is enabled.
>
I believe MTE is all or nothing? i.e. all the memory is tagged when enabled, but will let the arm folks confirm.
Yeah, unfortunately I think that might be the only way. We can't change the pointers and I don't think there is a way
to mark the memory as "untagged". If we can't remap to the zeropage, then there is no point in the shrinker. I am guessing
that instead of checking at runtime whether MTE is enabled when remapping to the shared zeropage, we need to #ifndef the
shrinker when CONFIG_ARM64_MTE is enabled?
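For illustration, the coarse compile-time version of that idea could look something like the below at the remap_page() call site in __split_huge_page() (untested sketch, only to make the option concrete):

	/*
	 * Untested sketch of the compile-time option: never request the
	 * shared-zeropage optimisation on kernels built with MTE support,
	 * so underutilized THPs still get split but their zero-filled
	 * subpages are no longer remapped to the shared zeropage.
	 */
	remap_page(folio, nr,
		   (!IS_ENABLED(CONFIG_ARM64_MTE) && PageAnon(head)) ?
			RMP_USE_SHARED_ZEROPAGE : 0);

The obvious downside is that this also penalises arm64 systems without MTE hardware, and processes that never use PROT_MTE, which is why a runtime check might still be preferable.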
> Also, I wonder how KSM and the shared zeropage works in general with that, because I would expect similar issues when we de-duplicate memory?
>
Yeah, that's a very good point!
Also, the initial report mentioned mTHP instead of THP, but I don't think that matters.
On 18.09.25 13:42, Usama Arif wrote:
>
>
> On 18/09/2025 09:56, David Hildenbrand wrote:
>> On 18.09.25 10:53, Qun-wei Lin (林群崴) wrote:
>>> On Fri, 2024-08-30 at 11:03 +0100, Usama Arif wrote:
>>>> [...]
>>>
>>> Hi,
>>>
>>> This patch has been in the mainline for some time, but we recently
>>> discovered an issue when both mTHP and MTE (Memory Tagging Extension)
>>> are enabled.
>>>
>>> It seems that remapping to the same zeropage might cause MTE tag
>>> mismatches, since MTE tags are associated with physical addresses.
>>
>> Does this only trigger when the VMA has mte enabled? Maybe we'll have to bail out if we detect that mte is enabled.
>>
>
> I believe MTE is all or nothing? i.e. all the memory is tagged when enabled, but will let the arm folks confirm.
>
I think for a process it's only active when PROT_MTE was called.
In khugepaged, I recall that we use
copy_mc_user_highpage()->copy_user_highpage(), which on arm64 translates
to its custom copy_highpage(), where the tags are copied as well.
So that's why khugepaged works.
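For reference, arm64's copy_highpage() does roughly the following (paraphrased and abridged from arch/arm64/mm/copypage.c; the real version also handles KASAN tag resetting):

void copy_highpage(struct page *to, struct page *from)
{
	void *kto = page_address(to);
	void *kfrom = page_address(from);

	copy_page(kto, kfrom);

	/* Carry the source page's MTE allocation tags over to the copy. */
	if (system_supports_mte() && page_mte_tagged(from)) {
		mte_copy_page_tags(kto, kfrom);
		set_page_mte_tagged(to);
	}
}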
KSM? Hmmm. Not sure how that is fenced off. We'd likely need some hidden
page_mte_tagged() check somewhere.
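Concretely, such a check could be a bail-out near the top of try_to_map_unused_to_zeropage() (and presumably something similar in the KSM merge path), along these lines. This is only a sketch: page_mte_tagged() is arm64-specific, so common code would need some arch wrapper; arch_page_has_metadata() below is an invented name, not an existing helper.

	/*
	 * Sketch only: skip the shared-zeropage optimisation when the
	 * subpage carries per-page metadata (e.g. MTE tags) that the
	 * shared zeropage cannot represent.  arch_page_has_metadata()
	 * does not exist today; on arm64 it could be backed by
	 * page_mte_tagged().
	 */
	if (arch_page_has_metadata(page))
		return false;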
--
Cheers
David / dhildenb
On 30 Aug 2024, at 6:03, Usama Arif wrote:
> [...]
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 6f9c62c746be..d039863e014b 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -204,13 +204,57 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
> return true;
> }
>
> +static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> + struct folio *folio,
> + unsigned long idx)
> +{
> + struct page *page = folio_page(folio, idx);
> + bool contains_data;
> + pte_t newpte;
> + void *addr;
> +
> + VM_BUG_ON_PAGE(PageCompound(page), page);
This should be:
diff --git a/mm/migrate.c b/mm/migrate.c
index e950fd62607f..7ffdbe078aa7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
pte_t newpte;
void *addr;
- VM_BUG_ON_PAGE(PageCompound(page), page);
+ if (PageCompound(page))
+ return false;
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
Otherwise, splitting anonymous large folios to non order-0 ones just
triggers this BUG_ON.
Best Regards,
Yan, Zi
On 23/10/2024 17:21, Zi Yan wrote:
> On 30 Aug 2024, at 6:03, Usama Arif wrote:
>
>> [...]
>> + VM_BUG_ON_PAGE(PageCompound(page), page);
>
> This should be:
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e950fd62607f..7ffdbe078aa7 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> pte_t newpte;
> void *addr;
>
> - VM_BUG_ON_PAGE(PageCompound(page), page);
> + if (PageCompound(page))
> + return false;
> VM_BUG_ON_PAGE(!PageAnon(page), page);
> VM_BUG_ON_PAGE(!PageLocked(page), page);
> VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
>
> Otherwise, splitting anonymous large folios to non order-0 ones just
> triggers this BUG_ON.
>
That makes sense, would you like to send the fix?
Adding Yu Zhao to "To" in case he has any objections.
Thanks,
Usama
On Wed, Oct 23, 2024 at 10:51 AM Usama Arif <usamaarif642@gmail.com> wrote:
>
> On 23/10/2024 17:21, Zi Yan wrote:
> > On 30 Aug 2024, at 6:03, Usama Arif wrote:
> >
> >> [...]
> >> + VM_BUG_ON_PAGE(PageCompound(page), page);
> >
> > This should be:
> >
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index e950fd62607f..7ffdbe078aa7 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -206,7 +206,8 @@ static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
> > pte_t newpte;
> > void *addr;
> >
> > - VM_BUG_ON_PAGE(PageCompound(page), page);
> > + if (PageCompound(page))
> > + return false;
> > VM_BUG_ON_PAGE(!PageAnon(page), page);
> > VM_BUG_ON_PAGE(!PageLocked(page), page);
> > VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
> >
> > Otherwise, splitting anonymous large folios to non order-0 ones just
> > triggers this BUG_ON.
> >
>
> That makes sense, would you like to send the fix?
>
> Adding Yu Zhao to "To" in case he has any objections.
LGTM. Thanks!
On 23 Oct 2024, at 12:50, Usama Arif wrote:
> On 23/10/2024 17:21, Zi Yan wrote:
>> On 30 Aug 2024, at 6:03, Usama Arif wrote:
>>
>>> [...]
>
> That makes sense, would you like to send the fix?
>
> Adding Yu Zhao to "To" in case he has any objections.
>
Sure, will do.
Best Regards,
Yan, Zi