[v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

[PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by Pedro Demarchi Gomes 3 months, 3 weeks ago

Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/

---

v3:
  - Treat THPs in ksm_pmd_entry
  - Update ksm_scan.address outside walk_page_range
  - Change goto to while loop

v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
  - Use pmd_entry to walk page range
  - Use cond_resched inside pmd_entry()
  - walk_page_range returns page+folio

v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/

Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 185 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 135 insertions(+), 50 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..403e4f102f07 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,14 +2455,119 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_walk_private {
+	struct page *page;
+	struct folio *folio;
+	struct vm_area_struct *vma;
+	unsigned long address;
+};
+
+static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private;
+
+	if (!(vma->vm_flags & VM_MERGEABLE))
+		return 1;
+
+	private = (struct ksm_walk_private *) walk->private;
+	private->address = vma->vm_end;
+
+	if (!vma->anon_vma)
+		return 1;
+
+	return 0;
+}
+
+static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
+			    unsigned long end, struct mm_walk *walk)
+{
+	struct mm_struct *mm = walk->mm;
+	struct vm_area_struct *vma = walk->vma;
+	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
+	struct folio *folio;
+	pte_t *start_pte, *pte, ptent;
+	pmd_t pmde;
+	struct page *page;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	if (ksm_test_exit(mm))
+		return 1;
+
+	ptl = pmd_lock(mm, pmd);
+	pmde = pmdp_get(pmd);
+
+	if (!pmd_present(pmde))
+		goto pmd_out;
+
+	if (!pmd_trans_huge(pmde))
+		goto pte_table;
+
+	page = vm_normal_page_pmd(vma, addr, pmde);
+
+	if (!page)
+		goto pmd_out;
+
+	folio = page_folio(page);
+	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+		goto pmd_out;
+
+	ret = 1;
+	folio_get(folio);
+	private->page = page + ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+	private->folio = folio;
+	private->vma = vma;
+	private->address = addr;
+pmd_out:
+	spin_unlock(ptl);
+	return ret;
+
+pte_table:
+	spin_unlock(ptl);
+
+	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!start_pte)
+		return 0;
+
+	for (; addr < end; pte++, addr += PAGE_SIZE) {
+		ptent = ptep_get(pte);
+		page = vm_normal_page(vma, addr, ptent);
+
+		if (!page)
+			continue;
+
+		folio = page_folio(page);
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+
+		ret = 1;
+		folio_get(folio);
+		private->page = page;
+		private->folio = folio;
+		private->vma = vma;
+		private->address = addr;
+		break;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	cond_resched();
+	return ret;
+}
+
+struct mm_walk_ops walk_ops = {
+	.pmd_entry = ksm_pmd_entry,
+	.test_walk = ksm_walk_test,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
-	struct vm_area_struct *vma;
 	struct ksm_rmap_item *rmap_item;
-	struct vma_iterator vmi;
+	struct ksm_walk_private walk_private;
 	int nid;
 
 	if (list_empty(&ksm_mm_head.slot.mm_node))
@@ -2527,64 +2632,44 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 	slot = &mm_slot->slot;
 	mm = slot->mm;
-	vma_iter_init(&vmi, mm, ksm_scan.address);
 
 	mmap_read_lock(mm);
 	if (ksm_test_exit(mm))
 		goto no_vmas;
 
-	for_each_vma(vmi, vma) {
-		if (!(vma->vm_flags & VM_MERGEABLE))
-			continue;
-		if (ksm_scan.address < vma->vm_start)
-			ksm_scan.address = vma->vm_start;
-		if (!vma->anon_vma)
-			ksm_scan.address = vma->vm_end;
-
-		while (ksm_scan.address < vma->vm_end) {
-			struct page *tmp_page = NULL;
-			struct folio_walk fw;
-			struct folio *folio;
+	while (true) {
+		struct folio *folio;
 
-			if (ksm_test_exit(mm))
-				break;
+		walk_private.page = NULL;
+		walk_private.folio = NULL;
+		walk_private.address = ksm_scan.address;
 
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
-			}
+		walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
+		ksm_scan.address = walk_private.address;
+		if (!walk_private.page)
+			break;
+
+		folio = walk_private.folio;
+		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
+		flush_dcache_page(walk_private.page);
+		rmap_item = get_next_rmap_item(mm_slot,
+			ksm_scan.rmap_list, ksm_scan.address);
+		if (rmap_item) {
+			ksm_scan.rmap_list =
+					&rmap_item->rmap_list;
 
-			if (tmp_page) {
-				flush_anon_page(vma, tmp_page, ksm_scan.address);
-				flush_dcache_page(tmp_page);
-				rmap_item = get_next_rmap_item(mm_slot,
-					ksm_scan.rmap_list, ksm_scan.address);
-				if (rmap_item) {
-					ksm_scan.rmap_list =
-							&rmap_item->rmap_list;
-
-					if (should_skip_rmap_item(folio, rmap_item)) {
-						folio_put(folio);
-						goto next_page;
-					}
-
-					ksm_scan.address += PAGE_SIZE;
-					*page = tmp_page;
-				} else {
-					folio_put(folio);
-				}
-				mmap_read_unlock(mm);
-				return rmap_item;
-			}
-next_page:
 			ksm_scan.address += PAGE_SIZE;
-			cond_resched();
+			if (should_skip_rmap_item(folio, rmap_item)) {
+				folio_put(folio);
+				continue;
+			}
+
+			*page = walk_private.page;
+		} else {
+			folio_put(folio);
 		}
+		mmap_read_unlock(mm);
+		return rmap_item;
 	}
 
 	if (ksm_test_exit(mm)) {
-- 
2.39.5

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 3 weeks ago

On 16.10.25 03:22, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> ---

This patch does too much in a single patch, which makes it
rather hard to review.

As a first step, we should focus on leaving most of
scan_get_next_rmap_item() alone and only focus on replacing
folio_walk by walk_page_range_vma().

Follow-up cleanups could try cleaning up scan_get_next_rmap_item()
-- and boy oh boy, does that function scream for quite some cleanups.

This is something minimal based on your v3. I applied plenty more
cleanups, and I wish we could further shrink the pmd_entry function,
but I have to give up for today (well, it's already tomorrow :) ).


Briefly tested with ksm selftests and my machine did not burn down my building.


 From d971b88056fe3fefe50e5d4fa5b359e8c8331b2c Mon Sep 17 00:00:00 2001
From: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Date: Wed, 15 Oct 2025 22:22:36 -0300
Subject: [PATCH] ksm: use range-walk function to jump over holes in
  scan_get_next_rmap_item

Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range_vma(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/

Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
  mm/ksm.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-------
  1 file changed, 103 insertions(+), 13 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdcef..8bd2b78c4f869 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,6 +2455,94 @@ static bool should_skip_rmap_item(struct folio *folio,
  	return true;
  }
  
+struct ksm_next_page_arg {
+	struct folio *folio;
+	struct page *page;
+	unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+		struct mm_walk *walk)
+{
+	struct ksm_next_page_arg *private = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *start_ptep = NULL, *ptep, pte;
+	struct mm_struct *mm = walk->mm;
+	struct folio *folio;
+	struct page *page;
+	spinlock_t *ptl;
+	pmd_t pmd;
+
+	if (ksm_test_exit(mm))
+		return 0;
+	cond_resched();
+
+	pmd = pmdp_get_lockless(pmdp);
+	if (!pmd_present(pmd))
+		return 0;
+
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+		ptl = pmd_lock(mm, pmdp);
+		pmd = pmdp_get(pmdp);
+
+		if (!pmd_present(pmd)) {
+			goto not_found_unlock;
+		} else if (pmd_leaf(pmd)) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (!page)
+				goto not_found_unlock;
+			folio = page_folio(page);
+
+			if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+				goto not_found_unlock;
+
+			page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+			goto found_unlock;
+		}
+		spin_unlock(ptl);
+	}
+
+	start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	if (!start_ptep)
+		return 0;
+
+	for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+		pte = ptep_get(ptep);
+
+		if (!pte_present(pte))
+			continue;
+
+		page = vm_normal_page(vma, addr, pte);
+		if (!page)
+			continue;
+		folio = page_folio(page);
+
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+		goto found_unlock;
+	}
+
+not_found_unlock:
+	spin_unlock(ptl);
+	if (start_ptep)
+		pte_unmap(start_ptep);
+	return 0;
+found_unlock:
+	folio_get(folio);
+	spin_unlock(ptl);
+	if (start_ptep)
+		pte_unmap(start_ptep);
+	private->page = page;
+	private->folio = folio;
+	private->addr = addr;
+	return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+	.pmd_entry = ksm_next_page_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
  static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
  {
  	struct mm_struct *mm;
@@ -2542,21 +2630,23 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
  			ksm_scan.address = vma->vm_end;
  
  		while (ksm_scan.address < vma->vm_end) {
+			struct ksm_next_page_arg ksm_next_page_arg;
  			struct page *tmp_page = NULL;
-			struct folio_walk fw;
  			struct folio *folio;
-
-			if (ksm_test_exit(mm))
-				break;
-
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
+			int found;
+
+			found = walk_page_range_vma(vma, ksm_scan.address,
+						    vma->vm_end,
+						    &ksm_next_page_ops,
+						    &ksm_next_page_arg);
+
+			if (found > 0) {
+				folio = ksm_next_page_arg.folio;
+				tmp_page = ksm_next_page_arg.page;
+				ksm_scan.address = ksm_next_page_arg.addr;
+			} else {
+				VM_WARN_ON_ONCE(found < 0);
+				ksm_scan.address = vma->vm_end - PAGE_SIZE;
  			}
  
  			if (tmp_page) {
-- 
2.51.0


-- 
Cheers

David / dhildenb

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by Pedro Demarchi Gomes 3 months, 2 weeks ago

On 10/17/25 19:23, David Hildenbrand wrote:

> This patch does to much in a single patch which makes it
> rather hard to review.
>
> As a first step, we should focus on leaving most of
> scan_get_next_rmap_item() alone and only focus on replacing
> folio_walk by walk_page_range_vma().
>
> Follow-up cleanups could try cleaning up scan_get_next_rmap_item()
> -- and boy oh boy, does that function scream for quite some cleanups.
>
> This is something minimal based on your v3. I applied plenty of more
> cleanups and I wish we could further shrink the pmd_entry function,
> but I have to give up for today (well, it's already tomorrow :) ). 

Should I send a v4 to be applied on top of your minimal patch? This
v4 would eliminate the need for the for_each_vma() loop by using the
test_walk callback, like the previous versions did.
Thanks for your patience.

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 2 weeks ago

On 21.10.25 05:00, Pedro Demarchi Gomes wrote:
> 
> On 10/17/25 19:23, David Hildenbrand wrote:
> 
>> This patch does to much in a single patch which makes it
>> rather hard to review.
>>
>> As a first step, we should focus on leaving most of
>> scan_get_next_rmap_item() alone and only focus on replacing
>> folio_walk by walk_page_range_vma().
>>
>> Follow-up cleanups could try cleaning up scan_get_next_rmap_item()
>> -- and boy oh boy, does that function scream for quite some cleanups.
>>
>> This is something minimal based on your v3. I applied plenty of more
>> cleanups and I wish we could further shrink the pmd_entry function,
>> but I have to give up for today (well, it's already tomorrow :) ).
> 
> Should I send a v4 to be applied on top of your minimal patch? This
> v4 would eliminate the need of the for_each_vma using the test_walk
> callback like the previous versions.

It would be good if you could test the rework I sent and see if you want 
to do any tweaks to it. It was a rather quick rework on my side.


Then resend that as v4, which is then minimal, and we can reasonably add
Fixes: + Cc: stable.

Right from the start we used follow_page() on each individual address.

So likely best to add

	Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")

Once that fix is in you can send further cleanups that are independent 
of the fix itself, like removing the for_each_vma() etc.

-- 
Cheers

David / dhildenb

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by craftfever 3 months, 3 weeks ago


David Hildenbrand wrote:
> On 16.10.25 03:22, Pedro Demarchi Gomes wrote:
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> [1] https://lore.kernel.org/linux- 
>> mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>>
>> ---
> 
> This patch does to much in a single patch which makes it
> rather hard to review.
> 
> As a first step, we should focus on leaving most of
> scan_get_next_rmap_item() alone and only focus on replacing
> folio_walk by walk_page_range_vma().
> 
> Follow-up cleanups could try cleaning up scan_get_next_rmap_item()
> -- and boy oh boy, does that function scream for quite some cleanups.
> 
> This is something minimal based on your v3. I applied plenty of more
> cleanups and I wish we could further shrink the pmd_entry function,
> but I have to give up for today (well, it's already tomorrow :) ).
> 
> 
> Briefly tested with ksm selftests and my machine did not burn down my 
> building.
> 
> 
>  From d971b88056fe3fefe50e5d4fa5b359e8c8331b2c Mon Sep 17 00:00:00 2001
> From: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> Date: Wed, 15 Oct 2025 22:22:36 -0300
> Subject: [PATCH] ksm: use range-walk function to jump over holes in
>   scan_get_next_rmap_item
> 
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range_vma(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux- 
> mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/ 
> r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>   mm/ksm.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-------
>   1 file changed, 103 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdcef..8bd2b78c4f869 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,6 +2455,94 @@ static bool should_skip_rmap_item(struct folio 
> *folio,
>       return true;
>   }
> 
> +struct ksm_next_page_arg {
> +    struct folio *folio;
> +    struct page *page;
> +    unsigned long addr;
> +};
> +
> +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, 
> unsigned long end,
> +        struct mm_walk *walk)
> +{
> +    struct ksm_next_page_arg *private = walk->private;
> +    struct vm_area_struct *vma = walk->vma;
> +    pte_t *start_ptep = NULL, *ptep, pte;
> +    struct mm_struct *mm = walk->mm;
> +    struct folio *folio;
> +    struct page *page;
> +    spinlock_t *ptl;
> +    pmd_t pmd;
> +
> +    if (ksm_test_exit(mm))
> +        return 0;
> +    cond_resched();
> +
> +    pmd = pmdp_get_lockless(pmdp);
> +    if (!pmd_present(pmd))
> +        return 0;
> +
> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
> +        ptl = pmd_lock(mm, pmdp);
> +        pmd = pmdp_get(pmdp);
> +
> +        if (!pmd_present(pmd)) {
> +            goto not_found_unlock;
> +        } else if (pmd_leaf(pmd)) {
> +            page = vm_normal_page_pmd(vma, addr, pmd);
> +            if (!page)
> +                goto not_found_unlock;
> +            folio = page_folio(page);
> +
> +            if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +                goto not_found_unlock;
> +
> +            page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
> +            goto found_unlock;
> +        }
> +        spin_unlock(ptl);
> +    }
> +
> +    start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +    if (!start_ptep)
> +        return 0;
> +
> +    for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
> +        pte = ptep_get(ptep);
> +
> +        if (!pte_present(pte))
> +            continue;
> +
> +        page = vm_normal_page(vma, addr, pte);
> +        if (!page)
> +            continue;
> +        folio = page_folio(page);
> +
> +        if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +            continue;
> +        goto found_unlock;
> +    }
> +
> +not_found_unlock:
> +    spin_unlock(ptl);
> +    if (start_ptep)
> +        pte_unmap(start_ptep);
> +    return 0;
> +found_unlock:
> +    folio_get(folio);
> +    spin_unlock(ptl);
> +    if (start_ptep)
> +        pte_unmap(start_ptep);
> +    private->page = page;
> +    private->folio = folio;
> +    private->addr = addr;
> +    return 1;
> +}
> +
> +static struct mm_walk_ops ksm_next_page_ops = {
> +    .pmd_entry = ksm_next_page_pmd_entry,
> +    .walk_lock = PGWALK_RDLOCK,
> +};
> +
>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   {
>       struct mm_struct *mm;
> @@ -2542,21 +2630,23 @@ static struct ksm_rmap_item 
> *scan_get_next_rmap_item(struct page **page)
>               ksm_scan.address = vma->vm_end;
> 
>           while (ksm_scan.address < vma->vm_end) {
> +            struct ksm_next_page_arg ksm_next_page_arg;
>               struct page *tmp_page = NULL;
> -            struct folio_walk fw;
>               struct folio *folio;
> -
> -            if (ksm_test_exit(mm))
> -                break;
> -
> -            folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> -            if (folio) {
> -                if (!folio_is_zone_device(folio) &&
> -                     folio_test_anon(folio)) {
> -                    folio_get(folio);
> -                    tmp_page = fw.page;
> -                }
> -                folio_walk_end(&fw, vma);
> +            int found;
> +
> +            found = walk_page_range_vma(vma, ksm_scan.address,
> +                            vma->vm_end,
> +                            &ksm_next_page_ops,
> +                            &ksm_next_page_arg);
> +
> +            if (found > 0) {
> +                folio = ksm_next_page_arg.folio;
> +                tmp_page = ksm_next_page_arg.page;
> +                ksm_scan.address = ksm_next_page_arg.addr;
> +            } else {
> +                VM_WARN_ON_ONCE(found < 0);
> +                ksm_scan.address = vma->vm_end - PAGE_SIZE;
>               }
> 
>               if (tmp_page) {


%)
Guys, I'm so sorry, I'm a little confused. Can you please tell me by
e-mail when the patch (or couple of patches) will be done, so it can be
properly tested? I'm a little lost in this process: is it ready or
not? Thank you)

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 2 weeks ago

On 18.10.25 09:30, craftfever wrote:
> 
> 
> David Hildenbrand wrote:
>> On 16.10.25 03:22, Pedro Demarchi Gomes wrote:
>>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>>> to locate mergeable pages. This becomes highly inefficient when scanning
>>> large virtual memory areas that contain mostly unmapped regions.
>>>
>>> This patch replaces the per-address lookup with a range walk using
>>> walk_page_range(). The range walker allows KSM to skip over entire
>>> unmapped holes in a VMA, avoiding unnecessary lookups.
>>> This problem was previously discussed in [1].
>>>
>>> [1] https://lore.kernel.org/linux-
>>> mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>>>
>>> ---
>>
>> This patch does to much in a single patch which makes it
>> rather hard to review.
>>
>> As a first step, we should focus on leaving most of
>> scan_get_next_rmap_item() alone and only focus on replacing
>> folio_walk by walk_page_range_vma().
>>
>> Follow-up cleanups could try cleaning up scan_get_next_rmap_item()
>> -- and boy oh boy, does that function scream for quite some cleanups.
>>
>> This is something minimal based on your v3. I applied plenty of more
>> cleanups and I wish we could further shrink the pmd_entry function,
>> but I have to give up for today (well, it's already tomorrow :) ).
>>
>>
>> Briefly tested with ksm selftests and my machine did not burn down my
>> building.
>>
>>
>>   From d971b88056fe3fefe50e5d4fa5b359e8c8331b2c Mon Sep 17 00:00:00 2001
>> From: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
>> Date: Wed, 15 Oct 2025 22:22:36 -0300
>> Subject: [PATCH] ksm: use range-walk function to jump over holes in
>>    scan_get_next_rmap_item
>>
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range_vma(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> [1] https://lore.kernel.org/linux-
>> mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>>
>> Reported-by: craftfever <craftfever@airmail.cc>
>> Closes: https://lkml.kernel.org/
>> r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
>> Co-developed-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>    mm/ksm.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++-------
>>    1 file changed, 103 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/ksm.c b/mm/ksm.c
>> index 3aed0478fdcef..8bd2b78c4f869 100644
>> --- a/mm/ksm.c
>> +++ b/mm/ksm.c
>> @@ -2455,6 +2455,94 @@ static bool should_skip_rmap_item(struct folio
>> *folio,
>>        return true;
>>    }
>>
>> +struct ksm_next_page_arg {
>> +    struct folio *folio;
>> +    struct page *page;
>> +    unsigned long addr;
>> +};
>> +
>> +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr,
>> unsigned long end,
>> +        struct mm_walk *walk)
>> +{
>> +    struct ksm_next_page_arg *private = walk->private;
>> +    struct vm_area_struct *vma = walk->vma;
>> +    pte_t *start_ptep = NULL, *ptep, pte;
>> +    struct mm_struct *mm = walk->mm;
>> +    struct folio *folio;
>> +    struct page *page;
>> +    spinlock_t *ptl;
>> +    pmd_t pmd;
>> +
>> +    if (ksm_test_exit(mm))
>> +        return 0;
>> +    cond_resched();
>> +
>> +    pmd = pmdp_get_lockless(pmdp);
>> +    if (!pmd_present(pmd))
>> +        return 0;
>> +
>> +    if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
>> +        ptl = pmd_lock(mm, pmdp);
>> +        pmd = pmdp_get(pmdp);
>> +
>> +        if (!pmd_present(pmd)) {
>> +            goto not_found_unlock;
>> +        } else if (pmd_leaf(pmd)) {
>> +            page = vm_normal_page_pmd(vma, addr, pmd);
>> +            if (!page)
>> +                goto not_found_unlock;
>> +            folio = page_folio(page);
>> +
>> +            if (folio_is_zone_device(folio) || !folio_test_anon(folio))
>> +                goto not_found_unlock;
>> +
>> +            page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
>> +            goto found_unlock;
>> +        }
>> +        spin_unlock(ptl);
>> +    }
>> +
>> +    start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
>> +    if (!start_ptep)
>> +        return 0;
>> +
>> +    for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
>> +        pte = ptep_get(ptep);
>> +
>> +        if (!pte_present(pte))
>> +            continue;
>> +
>> +        page = vm_normal_page(vma, addr, pte);
>> +        if (!page)
>> +            continue;
>> +        folio = page_folio(page);
>> +
>> +        if (folio_is_zone_device(folio) || !folio_test_anon(folio))
>> +            continue;
>> +        goto found_unlock;
>> +    }
>> +
>> +not_found_unlock:
>> +    spin_unlock(ptl);
>> +    if (start_ptep)
>> +        pte_unmap(start_ptep);
>> +    return 0;
>> +found_unlock:
>> +    folio_get(folio);
>> +    spin_unlock(ptl);
>> +    if (start_ptep)
>> +        pte_unmap(start_ptep);
>> +    private->page = page;
>> +    private->folio = folio;
>> +    private->addr = addr;
>> +    return 1;
>> +}
>> +
>> +static struct mm_walk_ops ksm_next_page_ops = {
>> +    .pmd_entry = ksm_next_page_pmd_entry,
>> +    .walk_lock = PGWALK_RDLOCK,
>> +};
>> +
>>    static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>>    {
>>        struct mm_struct *mm;
>> @@ -2542,21 +2630,23 @@ static struct ksm_rmap_item
>> *scan_get_next_rmap_item(struct page **page)
>>                ksm_scan.address = vma->vm_end;
>>
>>            while (ksm_scan.address < vma->vm_end) {
>> +            struct ksm_next_page_arg ksm_next_page_arg;
>>                struct page *tmp_page = NULL;
>> -            struct folio_walk fw;
>>                struct folio *folio;
>> -
>> -            if (ksm_test_exit(mm))
>> -                break;
>> -
>> -            folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
>> -            if (folio) {
>> -                if (!folio_is_zone_device(folio) &&
>> -                     folio_test_anon(folio)) {
>> -                    folio_get(folio);
>> -                    tmp_page = fw.page;
>> -                }
>> -                folio_walk_end(&fw, vma);
>> +            int found;
>> +
>> +            found = walk_page_range_vma(vma, ksm_scan.address,
>> +                            vma->vm_end,
>> +                            &ksm_next_page_ops,
>> +                            &ksm_next_page_arg);
>> +
>> +            if (found > 0) {
>> +                folio = ksm_next_page_arg.folio;
>> +                tmp_page = ksm_next_page_arg.page;
>> +                ksm_scan.address = ksm_next_page_arg.addr;
>> +            } else {
>> +                VM_WARN_ON_ONCE(found < 0);
>> +                ksm_scan.address = vma->vm_end - PAGE_SIZE;
>>                }
>>
>>                if (tmp_page) {
> 
> 
> %)
> Guys, I'm so sorry, I"m little confused, can you lease tell further by
> e-mail, when patch or couple of patches will be done, so it could
> properly tested, 'cause I'm little lost in this progress, is it ready or
> not, thank you)

In general, we consider code ready once it was reviewed and acked by a 
maintainer.

Andrew usually throws patches ahead of time into mm/mm-unstable while
review is still going on. That does not mean yet that the patches will
go upstream; it's merely to give them some initial exposure to build
bots etc.

You can feel free to test the revised patch submitted by me inline, or 
wait for a v4.

-- 
Cheers

David / dhildenb

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 3 weeks ago

On 16.10.25 03:22, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> ---
> 
> v3:
>    - Treat THPs in ksm_pmd_entry
>    - Update ksm_scan.address outside walk_page_range
>    - Change goto to while loop
> 
> v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
>    - Use pmd_entry to walk page range
>    - Use cond_resched inside pmd_entry()
>    - walk_page_range returns page+folio
> 
> v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/
> 
> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---

This patch will need some more work.

@Andrew, please drop it for now.

-- 
Cheers

David / dhildenb

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by Andrew Morton 3 months, 3 weeks ago

On Wed, 15 Oct 2025 22:22:36 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:

> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> ...
>
> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>

Is Fixes: b1d3e9bbccb4 ("mm/ksm: convert scan_get_next_rmap_item() from
follow_page() to folio_walk") appropriate?  

The problem which is being addressed seems pretty serious.  What do
people think about proposing a -stable backport of this fix?

It would be better if this changelog were to describe the user-visible
effects of the problem.  A copy-n-paste from
https://bugzilla.kernel.org/show_bug.cgi?id=220599 would suffice.

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 2 weeks ago

On 16.10.25 23:07, Andrew Morton wrote:
> On Wed, 15 Oct 2025 22:22:36 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:
> 
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> ...
>>
>> Reported-by: craftfever <craftfever@airmail.cc>
>> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> 
> Is Fixes: b1d3e9bbccb4 ("mm/ksm: convert scan_get_next_rmap_item() from
> follow_page() to folio_walk") appropriate?

No.

That commit is not the problem.

The problem probably goes back to when scan_get_next_rmap_item() was first 
introduced (likely when KSM was added): it simply never was optimized to 
deal with large sparse memory areas.

> 
> The problem which is being addressed seems pretty serious.  What do
> people think about proposing a -stable backport of this fix?

We'll likely have to backport it to each and every stable tree. We could 
think about limiting backports only to kernels that actually allow for 
enabling KSM for a complete process.

So that would make sense to me.

-- 
Cheers

David / dhildenb

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by craftfever 3 months, 3 weeks ago


Andrew Morton wrote:
> On Wed, 15 Oct 2025 22:22:36 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:
> 
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> ...
>>
>> Reported-by: craftfever <craftfever@airmail.cc>
>> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> 
> Is Fixes: b1d3e9bbccb4 ("mm/ksm: convert scan_get_next_rmap_item() from
> follow_page() to folio_walk") appropriate?
> 
> The problem which is being addressed seems pretty serious.  What do
> people think about proposing a -stable backport of this fix?
> 
> It would be better if this changelog were to describe the user-visible
> effects of the problem.  A copy-n-paste from
> https://bugzilla.kernel.org/show_bug.cgi?id=220599 would suffice.

Emergency Update:

A moment ago ksmd crashed on me, so the patch really needs further work. Trace:

[ 2472.174930] BUG: Bad page map in process ksmd  pte:fffffffffffff600
[ 2472.174938] pgd:11394a067 p4d:11394a067 pud:100f96067 pmd:102c68067
[ 2472.174941] addr:00007f2ae1511000 vm_flags:c8100073 
anon_vma:ffff8ab79bcea1a0 mapping:0000000000000000 index:7f2ae1511
[ 2472.174944] file:(null) fault:0x0 mmap:0x0 mmap_prepare: 0x0 
read_folio:0x0
[ 2472.174978] CPU: 2 UID: 0 PID: 52 Comm: ksmd Tainted: G S  BU     OE 
      6.18.0-rc1-1-git-00014-g1f4a222b0e33-dirty #4 PREEMPT(voluntary) 
b9513c77908d39edabd314a5ac9b34ef2c53c2c8
[ 2472.174984] Tainted: [S]=CPU_OUT_OF_SPEC, [B]=BAD_PAGE, [U]=USER, 
[O]=OOT_MODULE, [E]=UNSIGNED_MODULE
[ 2472.174985] Hardware name: FUJITSU LIFEBOOK AH532/G21/FJNBB1D, BIOS 
Version 1.12 06/10/2019
[ 2472.174987] Sched_ext: 
rusty_1.0.16_ge25cc6e5_dirty_x86_64_unknown_linux_gnu (enabled+all), 
task: runnable_at=-5ms
[ 2472.174989] Call Trace:
[ 2472.174990]  <TASK>
[ 2472.174992]  dump_stack_lvl+0x5d/0x80
[ 2472.174997]  print_bad_page_map.cold+0x26d/0x355
[ 2472.175000]  ? ___pte_offset_map+0x1b/0x160
[ 2472.175005]  vm_normal_page+0xf4/0x100
[ 2472.175010]  ksm_pmd_entry+0x1cf/0x2f0
[ 2472.175014]  walk_pgd_range+0x5a2/0xb50
[ 2472.175020]  __walk_page_range+0x6e/0x1e0
[ 2472.175025]  walk_page_range_mm+0x150/0x210
[ 2472.175030]  ksm_scan_thread+0x166/0x2080
[ 2472.175037]  ? __pfx_ksm_scan_thread+0x10/0x10
[ 2472.175042]  kthread+0xfc/0x240
[ 2472.175046]  ? __pfx_kthread+0x10/0x10
[ 2472.175050]  ret_from_fork+0x1c2/0x1f0
[ 2472.175053]  ? __pfx_kthread+0x10/0x10
[ 2472.175057]  ret_from_fork_asm+0x1a/0x30
[ 2472.175062]  </TASK>
[ 2472.175132] BUG: Bad page map in process ksmd  pte:fffffffffffff600
[ 2472.175139] pgd:11394a067 p4d:11394a067 pud:100f96067 pmd:11989b067
[ 2472.175143] addr:00007f2ae1712000 vm_flags:c8100073 
anon_vma:ffff8ab79bcea1a0 mapping:0000000000000000 index:7f2ae1712



KSM crashed.

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by craftfever 3 months, 3 weeks ago


Andrew Morton wrote:
> On Wed, 15 Oct 2025 22:22:36 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:
> 
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> ...
>>
>> Reported-by: craftfever <craftfever@airmail.cc>
>> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> 
> Is Fixes: b1d3e9bbccb4 ("mm/ksm: convert scan_get_next_rmap_item() from
> follow_page() to folio_walk") appropriate?
> 
> The problem which is being addressed seems pretty serious.  What do
> people think about proposing a -stable backport of this fix?
> 
> It would be better if this changelog were to describe the user-visible
> effects of the problem.  A copy-n-paste from
> https://bugzilla.kernel.org/show_bug.cgi?id=220599 would suffice.

I must admit that with this particular fix, which scans only mapped 
pages with actual data, the effectiveness of KSM is even better according 
to stats, and CPU consumption is even smaller than it was in 6.12-6.15 
kernels, so I think it isn't necessary to revert that 6.12 folio_walk 
commit; applying this fix to the master tree and backporting it into the 
stable branch is sufficient.

Re: [PATCH v3] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by craftfever 3 months, 3 weeks ago


Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> ---
> 
> v3:
>    - Treat THPs in ksm_pmd_entry
>    - Update ksm_scan.address outside walk_page_range
>    - Change goto to while loop
> 
> v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
>    - Use pmd_entry to walk page range
>    - Use cond_resched inside pmd_entry()
>    - walk_page_range returns page+folio
> 
> v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/
> 
> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
>   mm/ksm.c | 185 ++++++++++++++++++++++++++++++++++++++++---------------
>   1 file changed, 135 insertions(+), 50 deletions(-)
> 
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..403e4f102f07 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,14 +2455,119 @@ static bool should_skip_rmap_item(struct folio *folio,
>   	return true;
>   }
>   
> +struct ksm_walk_private {
> +	struct page *page;
> +	struct folio *folio;
> +	struct vm_area_struct *vma;
> +	unsigned long address;
> +};
> +
> +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> +{
> +	struct vm_area_struct *vma = walk->vma;
> +	struct ksm_walk_private *private;
> +
> +	if (!(vma->vm_flags & VM_MERGEABLE))
> +		return 1;
> +
> +	private = (struct ksm_walk_private *) walk->private;
> +	private->address = vma->vm_end;
> +
> +	if (!vma->anon_vma)
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
> +			    unsigned long end, struct mm_walk *walk)
> +{
> +	struct mm_struct *mm = walk->mm;
> +	struct vm_area_struct *vma = walk->vma;
> +	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> +	struct folio *folio;
> +	pte_t *start_pte, *pte, ptent;
> +	pmd_t pmde;
> +	struct page *page;
> +	spinlock_t *ptl;
> +	int ret = 0;
> +
> +	if (ksm_test_exit(mm))
> +		return 1;
> +
> +	ptl = pmd_lock(mm, pmd);
> +	pmde = pmdp_get(pmd);
> +
> +	if (!pmd_present(pmde))
> +		goto pmd_out;
> +
> +	if (!pmd_trans_huge(pmde))
> +		goto pte_table;
> +
> +	page = vm_normal_page_pmd(vma, addr, pmde);
> +
> +	if (!page)
> +		goto pmd_out;
> +
> +	folio = page_folio(page);
> +	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +		goto pmd_out;
> +
> +	ret = 1;
> +	folio_get(folio);
> +	private->page = page + ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
> +	private->folio = folio;
> +	private->vma = vma;
> +	private->address = addr;
> +pmd_out:
> +	spin_unlock(ptl);
> +	return ret;
> +
> +pte_table:
> +	spin_unlock(ptl);
> +
> +	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +	if (!start_pte)
> +		return 0;
> +
> +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> +		ptent = ptep_get(pte);
> +		page = vm_normal_page(vma, addr, ptent);
> +
> +		if (!page)
> +			continue;
> +
> +		folio = page_folio(page);
> +		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +			continue;
> +
> +		ret = 1;
> +		folio_get(folio);
> +		private->page = page;
> +		private->folio = folio;
> +		private->vma = vma;
> +		private->address = addr;
> +		break;
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +
> +	cond_resched();
> +	return ret;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> +	.pmd_entry = ksm_pmd_entry,
> +	.test_walk = ksm_walk_test,
> +	.walk_lock = PGWALK_RDLOCK,
> +};
> +
>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   {
>   	struct mm_struct *mm;
>   	struct ksm_mm_slot *mm_slot;
>   	struct mm_slot *slot;
> -	struct vm_area_struct *vma;
>   	struct ksm_rmap_item *rmap_item;
> -	struct vma_iterator vmi;
> +	struct ksm_walk_private walk_private;
>   	int nid;
>   
>   	if (list_empty(&ksm_mm_head.slot.mm_node))
> @@ -2527,64 +2632,44 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   
>   	slot = &mm_slot->slot;
>   	mm = slot->mm;
> -	vma_iter_init(&vmi, mm, ksm_scan.address);
>   
>   	mmap_read_lock(mm);
>   	if (ksm_test_exit(mm))
>   		goto no_vmas;
>   
> -	for_each_vma(vmi, vma) {
> -		if (!(vma->vm_flags & VM_MERGEABLE))
> -			continue;
> -		if (ksm_scan.address < vma->vm_start)
> -			ksm_scan.address = vma->vm_start;
> -		if (!vma->anon_vma)
> -			ksm_scan.address = vma->vm_end;
> -
> -		while (ksm_scan.address < vma->vm_end) {
> -			struct page *tmp_page = NULL;
> -			struct folio_walk fw;
> -			struct folio *folio;
> +	while (true) {
> +		struct folio *folio;
>   
> -			if (ksm_test_exit(mm))
> -				break;
> +		walk_private.page = NULL;
> +		walk_private.folio = NULL;
> +		walk_private.address = ksm_scan.address;
>   
> -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> -			if (folio) {
> -				if (!folio_is_zone_device(folio) &&
> -				     folio_test_anon(folio)) {
> -					folio_get(folio);
> -					tmp_page = fw.page;
> -				}
> -				folio_walk_end(&fw, vma);
> -			}
> +		walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> +		ksm_scan.address = walk_private.address;
> +		if (!walk_private.page)
> +			break;
> +
> +		folio = walk_private.folio;
> +		flush_anon_page(walk_private.vma, walk_private.page, ksm_scan.address);
> +		flush_dcache_page(walk_private.page);
> +		rmap_item = get_next_rmap_item(mm_slot,
> +			ksm_scan.rmap_list, ksm_scan.address);
> +		if (rmap_item) {
> +			ksm_scan.rmap_list =
> +					&rmap_item->rmap_list;
>   
> -			if (tmp_page) {
> -				flush_anon_page(vma, tmp_page, ksm_scan.address);
> -				flush_dcache_page(tmp_page);
> -				rmap_item = get_next_rmap_item(mm_slot,
> -					ksm_scan.rmap_list, ksm_scan.address);
> -				if (rmap_item) {
> -					ksm_scan.rmap_list =
> -							&rmap_item->rmap_list;
> -
> -					if (should_skip_rmap_item(folio, rmap_item)) {
> -						folio_put(folio);
> -						goto next_page;
> -					}
> -
> -					ksm_scan.address += PAGE_SIZE;
> -					*page = tmp_page;
> -				} else {
> -					folio_put(folio);
> -				}
> -				mmap_read_unlock(mm);
> -				return rmap_item;
> -			}
> -next_page:
>   			ksm_scan.address += PAGE_SIZE;
> -			cond_resched();
> +			if (should_skip_rmap_item(folio, rmap_item)) {
> +				folio_put(folio);
> +				continue;
> +			}
> +
> +			*page = walk_private.page;
> +		} else {
> +			folio_put(folio);
>   		}
> +		mmap_read_unlock(mm);
> +		return rmap_item;
>   	}
>   
>   	if (ksm_test_exit(mm)) {

I've finally compiled linux-git 6.18rc1 with the v3 version of this patch 
and I'm highly impressed. The footprint of KSM is even lighter than it was 
in versions prior to 6.15, and the system overall seems much more 
performant, even apart from KSM performance. It would be an exciting 
release; 6.18 definitely deserves to be the next LTS version, big thanks 
to you. The new algorithm with page-walk and revising VMA works 
absolutely great.