[PATCH v4] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by Pedro Demarchi Gomes 3 months, 2 weeks ago
Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
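
For readers less familiar with the pagewalk API, the pattern this patch
relies on looks roughly like the following. This is a condensed,
hypothetical sketch (the names are made up); the real callback in the
diff below additionally handles THP leaves, zone-device folios and the
PTE-level walk:

#include <linux/mm.h>
#include <linux/pagewalk.h>

struct hole_skip_arg {
	unsigned long found_addr;
};

/* Only called for non-empty PMDs; pmd_none() holes are skipped by the walker. */
static int hole_skip_pmd_entry(pmd_t *pmdp, unsigned long addr,
			       unsigned long end, struct mm_walk *walk)
{
	struct hole_skip_arg *arg = walk->private;

	if (!pmd_present(pmdp_get_lockless(pmdp)))
		return 0;	/* keep walking */

	arg->found_addr = addr;
	return 1;		/* a non-zero return stops the walk early */
}

static const struct mm_walk_ops hole_skip_ops = {
	.pmd_entry	= hole_skip_pmd_entry,
	.walk_lock	= PGWALK_RDLOCK,	/* caller holds mmap_lock for read */
};

/*
 * Caller side, with mmap_lock already held for read:
 *
 *	struct hole_skip_arg arg = {};
 *
 *	if (walk_page_range_vma(vma, start, vma->vm_end,
 *				&hole_skip_ops, &arg) > 0)
 *		start = arg.found_addr;
 */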

---

v4:
  - Make minimal changes to replace folio_walk by walk_page_range_vma

v3: https://lore.kernel.org/all/20251016012236.4189-1-pedrodemargomes@gmail.com/
  - Treat THPs in ksm_pmd_entry
  - Update ksm_scan.address outside walk_page_range
  - Change goto to while loop

v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
  - Use pmd_entry to walk page range
  - Use cond_resched inside pmd_entry()
  - walk_page_range returns page+folio

v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/

Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 104 insertions(+), 9 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..4f672f4f2140 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_next_page_arg {
+	struct folio *folio;
+	struct page *page;
+	unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+		struct mm_walk *walk)
+{
+	struct ksm_next_page_arg *private = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *start_ptep = NULL, *ptep, pte;
+	struct mm_struct *mm = walk->mm;
+	struct folio *folio;
+	struct page *page;
+	spinlock_t *ptl;
+	pmd_t pmd;
+
+	if (ksm_test_exit(mm))
+		return 0;
+
+	cond_resched();
+
+	pmd = pmdp_get_lockless(pmdp);
+	if (!pmd_present(pmd))
+		return 0;
+
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+		ptl = pmd_lock(mm, pmdp);
+		pmd = pmdp_get(pmdp);
+
+		if (!pmd_present(pmd)) {
+			goto not_found_unlock;
+		} else if (pmd_leaf(pmd)) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (!page)
+				goto not_found_unlock;
+			folio = page_folio(page);
+
+			if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+				goto not_found_unlock;
+
+			page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+			goto found_unlock;
+		}
+		spin_unlock(ptl);
+	}
+
+	start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	if (!start_ptep)
+		return 0;
+
+	for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+		pte = ptep_get(ptep);
+
+		if (!pte_present(pte))
+			continue;
+
+		page = vm_normal_page(vma, addr, pte);
+		if (!page)
+			continue;
+		folio = page_folio(page);
+
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+		goto found_unlock;
+	}
+
+not_found_unlock:
+	spin_unlock(ptl);
+	if (start_ptep)
+		pte_unmap(start_ptep);
+	return 0;
+found_unlock:
+	folio_get(folio);
+	spin_unlock(ptl);
+	if (start_ptep)
+		pte_unmap(start_ptep);
+	private->page = page;
+	private->folio = folio;
+	private->addr = addr;
+	return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+	.pmd_entry = ksm_next_page_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
@@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 			ksm_scan.address = vma->vm_end;
 
 		while (ksm_scan.address < vma->vm_end) {
+			struct ksm_next_page_arg ksm_next_page_arg;
 			struct page *tmp_page = NULL;
-			struct folio_walk fw;
 			struct folio *folio;
 
 			if (ksm_test_exit(mm))
 				break;
 
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
+			int found;
+
+			found = walk_page_range_vma(vma, ksm_scan.address,
+						    vma->vm_end,
+						    &ksm_next_page_ops,
+						    &ksm_next_page_arg);
+
+			if (found > 0) {
+				folio = ksm_next_page_arg.folio;
+				tmp_page = ksm_next_page_arg.page;
+				ksm_scan.address = ksm_next_page_arg.addr;
+			} else {
+				VM_WARN_ON_ONCE(found < 0);
+				ksm_scan.address = vma->vm_end - PAGE_SIZE;
 			}
 
 			if (tmp_page) {
-- 
2.43.0
Re: [PATCH v4] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
Posted by Andrew Morton 3 months, 2 weeks ago
On Wed, 22 Oct 2025 12:30:59 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:

> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 

Thanks.  It would be helpful if the changelog were to tell people how
significant this change is for our users.

> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io

Buried in here is a claim that a large amount of CPU is being used, but
nothing quantitative.

So is there something we can tell people who are looking at this patch
in Feb 2026 and wondering "hm, should I add that to our kernel"?

> Suggested-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")

If the observed runtime problem is bad enough then a cc:stable might be
justified.  But a description of that observed runtime behavior would
be needed for that, please.
Re: [PATCH v4] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
Posted by David Hildenbrand 3 months, 2 weeks ago
On 22.10.25 22:31, Andrew Morton wrote:
> On Wed, 22 Oct 2025 12:30:59 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:
> 
>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>> to locate mergeable pages. This becomes highly inefficient when scanning
>> large virtual memory areas that contain mostly unmapped regions.
>>
>> This patch replaces the per-address lookup with a range walk using
>> walk_page_range(). The range walker allows KSM to skip over entire
>> unmapped holes in a VMA, avoiding unnecessary lookups.
>> This problem was previously discussed in [1].
>>
>> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>>
> 
> Thanks.  It would be helpful if the changelog were to tell people how
> significant this change is for our users.
> 
>> Reported-by: craftfever <craftfever@airmail.cc>
>> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> 
> Buried in here is a claim that a large amount of CPU is being used, but
> nothing quantitative.
> 
> So is there something we can tell people who are looking at this patch
> in Feb 2026 and wondering "hm, should I add that to our kernel"?
> 
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Co-developed-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
> 
> If the observed runtime problem is bad enough then a cc:stable might be
> justified.  But a description of that observed runtime behavior would
> be needed for that, please.

Agreed.

With the following simple program

#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>

/* 32 TiB */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;

int main() {
         char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                           MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);

         if (area == MAP_FAILED) {
                 perror("mmap() failed\n");
                 return -1;
         }

         /* Populate a single page such that we get an anon_vma. */
         *area = 0;

         /* Enable KSM. */
         madvise(area, size, MADV_MERGEABLE);
         pause();
         return 0;
}

$ ./ksm-sparse  &
$ echo 1 > /sys/kernel/mm/ksm/run

ksmd goes to 100% for quite a long time.

Now imagine if a cloud user spins up a couple of these programs.

KSM in the system is essentially deadlocked not able to deduplicate
anything of value.
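
For anyone reproducing this, a rough way to make the stall visible is
to poll the KSM sysfs counters while ksmd spins: the first full scan
takes so long that full_scans barely moves and nothing gets shared.
A trivial helper (illustration only, not part of the patch or the
report):

#include <stdio.h>
#include <unistd.h>

static void dump(const char *path)
{
	char buf[64];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	for (;;) {
		/* full_scans not advancing while ksmd sits at 100% CPU is the symptom */
		dump("/sys/kernel/mm/ksm/full_scans");
		dump("/sys/kernel/mm/ksm/pages_shared");
		sleep(5);
	}
	return 0;
}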

@Pedro, can you incorporate all that in the patch description?

-- 
Cheers

David / dhildenb
Re: [PATCH v4] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
Posted by craftfever 3 months, 2 weeks ago

David Hildenbrand wrote:
> On 22.10.25 22:31, Andrew Morton wrote:
>> On Wed, 22 Oct 2025 12:30:59 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:
>>
>>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>>> to locate mergeable pages. This becomes highly inefficient when scanning
>>> large virtual memory areas that contain mostly unmapped regions.
>>>
>>> This patch replaces the per-address lookup with a range walk using
>>> walk_page_range(). The range walker allows KSM to skip over entire
>>> unmapped holes in a VMA, avoiding unnecessary lookups.
>>> This problem was previously discussed in [1].
>>>
>>> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>>>
>>
>> Thanks.  It would be helpful if the changelog were to tell people how
>> significant this change is for our users.
>>
>>> Reported-by: craftfever <craftfever@airmail.cc>
>>> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
>>
>> Buried in here is a claim that a large amount of CPU is being used, but
>> nothing quantitative.
>>
>> So is there something we can tell people who are looking at this patch
>> in Feb 2026 and wondering "hm, should I add that to our kernel"?
>>
>>> Suggested-by: David Hildenbrand <david@redhat.com>
>>> Co-developed-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
>>
>> If the observed runtime problem is bad enough then a cc:stable might be
>> justified.  But a description of that observed runtime behavior would
>> be needed for that, please.
> 
> Agreed.
> 
> With the following simple program
> 
> #include <unistd.h>
> #include <stdio.h>
> #include <sys/mman.h>
> 
> /* 32 TiB */
> const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
> 
> int main() {
>          char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
>                            MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
> 
>          if (area == MAP_FAILED) {
>                  perror("mmap() failed\n");
>                  return -1;
>          }
> 
>          /* Populate a single page such that we get an anon_vma. */
>          *area = 0;
> 
>          /* Enable KSM. */
>          madvise(area, size, MADV_MERGEABLE);
>          pause();
>          return 0;
> }
> 
> $ ./ksm-sparse  &
> $ echo 1 > /sys/kernel/mm/ksm/run
> 
> ksmd goes to 100% for quite a long time.
> 
> Now imagine if a cloud user spins up a couple of these programs.
> 
> KSM in the system is essentially deadlocked not able to deduplicate
> anything of value.
> 
> @Pedro, can you incorporate all that in the patch description?
> 

Thanks for the example and explanation, that's exactly what I meant.
Big datacenters and servers are primary use cases for Linux, for
example when many VMs are running, and KSM has to be very robust to
deal with such huge amounts of memory. And, as said, this bug also
happens with consumer apps like Chromium/Electron (and apps based on
it, like VS Code) when a user decides to enable KSM for them. This
patch really needs to go into the master branch, and very preferably
be backported to the 6.17-stable branch; it is highly important. By
the way, I've tested v4 and it is fine: stable, effective and very
light. Will the v5 version with the more comprehensive description be
the final version?
Re: [PATCH v4] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
Posted by Pedro Demarchi Gomes 3 months, 2 weeks ago
On 10/22/25 17:52, David Hildenbrand wrote:

> On 22.10.25 22:31, Andrew Morton wrote:
>> On Wed, 22 Oct 2025 12:30:59 -0300 Pedro Demarchi Gomes <pedrodemargomes@gmail.com> wrote:
>>
>>> Currently, scan_get_next_rmap_item() walks every page address in a VMA
>>> to locate mergeable pages. This becomes highly inefficient when scanning
>>> large virtual memory areas that contain mostly unmapped regions.
>>>
>>> This patch replaces the per-address lookup with a range walk using
>>> walk_page_range(). The range walker allows KSM to skip over entire
>>> unmapped holes in a VMA, avoiding unnecessary lookups.
>>> This problem was previously discussed in [1].
>>>
>>> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>>>
>>
>> Thanks.  It would be helpful if the changelog were to tell people how
>> significant this change is for our users.
>>
>>> Reported-by: craftfever <craftfever@airmail.cc>
>>> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
>>
>> Buried in here is a claim that a large amount of CPU is being used, but
>> nothing quantitative.
>>
>> So is there something we can tell people who are looking at this patch
>> in Feb 2026 and wondering "hm, should I add that to our kernel"?
>>
>>> Suggested-by: David Hildenbrand <david@redhat.com>
>>> Co-developed-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
>>
>> If the observed runtime problem is bad enough then a cc:stable might be
>> justified.  But a description of that observed runtime behavior would
>> be needed for that, please.
>
> Agreed.
>
> With the following simple program
>
> #include <unistd.h>
> #include <stdio.h>
> #include <sys/mman.h>
>
> /* 32 TiB */
> const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
>
> int main() {
>         char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
>                           MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
>
>         if (area == MAP_FAILED) {
>                 perror("mmap() failed\n");
>                 return -1;
>         }
>
>         /* Populate a single page such that we get an anon_vma. */
>         *area = 0;
>
>         /* Enable KSM. */
>         madvise(area, size, MADV_MERGEABLE);
>         pause();
>         return 0;
> }
>
> $ ./ksm-sparse  &
> $ echo 1 > /sys/kernel/mm/ksm/run
>
> ksmd goes to 100% for quite a long time.
>
> Now imagine if a cloud user spins up a couple of these programs.
>
> KSM in the system is essentially deadlocked not able to deduplicate
> anything of value.
>
> @Pedro, can you incorporate all that in the patch description? 

Ok, I will send a v5 with a better changelog and patch description.
Thanks!