[v5] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

[PATCH v5] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by Pedro Demarchi Gomes 3 months, 2 weeks ago

Currently, scan_get_next_rmap_item() walks every page address in a VMA
to locate mergeable pages. This becomes highly inefficient when scanning
large virtual memory areas that contain mostly unmapped regions, causing
ksmd to use large amount of cpu without deduplicating much pages.

This patch replaces the per-address lookup with a range walk using
walk_page_range(). The range walker allows KSM to skip over entire
unmapped holes in a VMA, avoiding unnecessary lookups.
This problem was previously discussed in [1].

Consider the following test program which creates a 32 TiB mapping in
the virtual address space but only populates a single page:

#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>

/* 32 TiB */
const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;

int main() {
        char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                          MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);

        if (area == MAP_FAILED) {
                perror("mmap() failed\n");
                return -1;
        }

        /* Populate a single page such that we get an anon_vma. */
        *area = 0;

        /* Enable KSM. */
        madvise(area, size, MADV_MERGEABLE);
        pause();
        return 0;
}

$ ./ksm-sparse  &
$ echo 1 > /sys/kernel/mm/ksm/run 

Without this patch ksmd uses 100% of the cpu for a long time (more then
1 hour in my test machine) scanning all the 32 TiB virtual address space
that contain only one mapped page. This makes ksmd essentially deadlocked
not able to deduplicate anything of value.
With this patch ksmd walks only the one mapped page and skips the rest of
the 32 TiB virtual address space, making the scan fast using little cpu.

[1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/

---
v5:
  - Improve patch description

v4: https://lore.kernel.org/linux-mm/20251022153059.22763-1-pedrodemargomes@gmail.com/
  - Make minimal changes to replace folio_walk by walk_page_range_vma

v3: https://lore.kernel.org/all/20251016012236.4189-1-pedrodemargomes@gmail.com/
  - Treat THPs in ksm_pmd_entry
  - Update ksm_scan.address outside walk_page_range
  - Change goto to while loop

v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
  - Use pmd_entry to walk page range
  - Use cond_resched inside pmd_entry()
  - walk_page_range returns page+folio

v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/

Reported-by: craftfever <craftfever@airmail.cc>
Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
Suggested-by: David Hildenbrand <david@redhat.com>
Co-developed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 104 insertions(+), 9 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 3aed0478fdce..4f672f4f2140 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
 	return true;
 }
 
+struct ksm_next_page_arg {
+	struct folio *folio;
+	struct page *page;
+	unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+		struct mm_walk *walk)
+{
+	struct ksm_next_page_arg *private = walk->private;
+	struct vm_area_struct *vma = walk->vma;
+	pte_t *start_ptep = NULL, *ptep, pte;
+	struct mm_struct *mm = walk->mm;
+	struct folio *folio;
+	struct page *page;
+	spinlock_t *ptl;
+	pmd_t pmd;
+
+	if (ksm_test_exit(mm))
+		return 0;
+
+	cond_resched();
+
+	pmd = pmdp_get_lockless(pmdp);
+	if (!pmd_present(pmd))
+		return 0;
+
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+		ptl = pmd_lock(mm, pmdp);
+		pmd = pmdp_get(pmdp);
+
+		if (!pmd_present(pmd)) {
+			goto not_found_unlock;
+		} else if (pmd_leaf(pmd)) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (!page)
+				goto not_found_unlock;
+			folio = page_folio(page);
+
+			if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+				goto not_found_unlock;
+
+			page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+			goto found_unlock;
+		}
+		spin_unlock(ptl);
+	}
+
+	start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+	if (!start_ptep)
+		return 0;
+
+	for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+		pte = ptep_get(ptep);
+
+		if (!pte_present(pte))
+			continue;
+
+		page = vm_normal_page(vma, addr, pte);
+		if (!page)
+			continue;
+		folio = page_folio(page);
+
+		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+			continue;
+		goto found_unlock;
+	}
+
+not_found_unlock:
+	spin_unlock(ptl);
+	if (start_ptep)
+		pte_unmap(start_ptep);
+	return 0;
+found_unlock:
+	folio_get(folio);
+	spin_unlock(ptl);
+	if (start_ptep)
+		pte_unmap(start_ptep);
+	private->page = page;
+	private->folio = folio;
+	private->addr = addr;
+	return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+	.pmd_entry = ksm_next_page_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
@@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 			ksm_scan.address = vma->vm_end;
 
 		while (ksm_scan.address < vma->vm_end) {
+			struct ksm_next_page_arg ksm_next_page_arg;
 			struct page *tmp_page = NULL;
-			struct folio_walk fw;
 			struct folio *folio;
 
 			if (ksm_test_exit(mm))
 				break;
 
-			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
-			if (folio) {
-				if (!folio_is_zone_device(folio) &&
-				     folio_test_anon(folio)) {
-					folio_get(folio);
-					tmp_page = fw.page;
-				}
-				folio_walk_end(&fw, vma);
+			int found;
+
+			found = walk_page_range_vma(vma, ksm_scan.address,
+						    vma->vm_end,
+						    &ksm_next_page_ops,
+						    &ksm_next_page_arg);
+
+			if (found > 0) {
+				folio = ksm_next_page_arg.folio;
+				tmp_page = ksm_next_page_arg.page;
+				ksm_scan.address = ksm_next_page_arg.addr;
+			} else {
+				VM_WARN_ON_ONCE(found < 0);
+				ksm_scan.address = vma->vm_end - PAGE_SIZE;
 			}
 
 			if (tmp_page) {
-- 
2.43.0

Re: [PATCH v5] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 2 weeks ago

On 23.10.25 05:58, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions, causing
> ksmd to use large amount of cpu without deduplicating much pages.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> Consider the following test program which creates a 32 TiB mapping in
> the virtual address space but only populates a single page:
> 
> #include <unistd.h>
> #include <stdio.h>
> #include <sys/mman.h>
> 
> /* 32 TiB */
> const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
> 
> int main() {
>          char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
>                            MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
> 
>          if (area == MAP_FAILED) {
>                  perror("mmap() failed\n");
>                  return -1;
>          }
> 
>          /* Populate a single page such that we get an anon_vma. */
>          *area = 0;
> 
>          /* Enable KSM. */
>          madvise(area, size, MADV_MERGEABLE);
>          pause();
>          return 0;
> }
> 
> $ ./ksm-sparse  &
> $ echo 1 > /sys/kernel/mm/ksm/run
> 
> Without this patch ksmd uses 100% of the cpu for a long time (more then
> 1 hour in my test machine) scanning all the 32 TiB virtual address space
> that contain only one mapped page. This makes ksmd essentially deadlocked
> not able to deduplicate anything of value.
> With this patch ksmd walks only the one mapped page and skips the rest of
> the 32 TiB virtual address space, making the scan fast using little cpu.
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> ---
> v5:
>    - Improve patch description
> 
> v4: https://lore.kernel.org/linux-mm/20251022153059.22763-1-pedrodemargomes@gmail.com/
>    - Make minimal changes to replace folio_walk by walk_page_range_vma
> 
> v3: https://lore.kernel.org/all/20251016012236.4189-1-pedrodemargomes@gmail.com/
>    - Treat THPs in ksm_pmd_entry
>    - Update ksm_scan.address outside walk_page_range
>    - Change goto to while loop
> 
> v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
>    - Use pmd_entry to walk page range
>    - Use cond_resched inside pmd_entry()
>    - walk_page_range returns page+folio
> 
> v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/
> 
> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Suggested-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>

I think we really want to

	Cc: stable@vger.kernel.org

Andrew can do that when applying.

Acked-by: David Hildenbrand <david@redhat.com>


As a note, we have similar code that should probably be doing a range 
walk instead: unmerge_ksm_pages()->break_ksm().

It can be triggered on a range through unmerge_ksm_pages(), which gets 
called from:

* ksm_madvise() through madvise(MADV_UNMERGEABLE).  There are not a lot 
of users of that function.

* __ksm_del_vma() through ksm_del_vmas(). Effectively called when 
disabling KSM for a process either through the sysctl or from s390x gmap 
code when enabling storage keys for a VM.

In both cases, it's not ksmd that's blocked, it's just that the 
operation (trigger by the app) takes longer.

So both are not as critical as this thing here, but likely we should 
take care of it at some point.

Interestingly, I converted that from a walk_page_range_vma() to 
folio_walk_start() after converting it from follow_page() to 
walk_page_range_vma().

But we never did a range walk, we just walked individual addresses, 
because that's what break_ksm() does.

We could effectively revert e317a8d8b4f600fc7ec9725e26417030ee594f52 and 
adjust it to perform an actual range walk by passing a range to break_ksm().

-- 
Cheers

David / dhildenb

Re: [PATCH v5] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by Pedro Demarchi Gomes 3 months, 2 weeks ago

On 10/23/25 07:13, David Hildenbrand wrote:
> As a note, we have similar code that should probably be doing a range walk instead: unmerge_ksm_pages()->break_ksm().
>
> It can be triggered on a range through unmerge_ksm_pages(), which gets called from:
>
> * ksm_madvise() through madvise(MADV_UNMERGEABLE).  There are not a lot of users of that function.
>
> * __ksm_del_vma() through ksm_del_vmas(). Effectively called when disabling KSM for a process either through the sysctl or from s390x gmap code when enabling storage keys for a VM.
>
> In both cases, it's not ksmd that's blocked, it's just that the operation (trigger by the app) takes longer.
>
> So both are not as critical as this thing here, but likely we should take care of it at some point.
>
> Interestingly, I converted that from a walk_page_range_vma() to folio_walk_start() after converting it from follow_page() to walk_page_range_vma().
>
> But we never did a range walk, we just walked individual addresses, because that's what break_ksm() does.
>
> We could effectively revert e317a8d8b4f600fc7ec9725e26417030ee594f52 and adjust it to perform an actual range walk by passing a range to break_ksm(). 


Thanks for letting me know. I will send a patch fixing this issue.

Re: [PATCH v5] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by craftfever 3 months, 2 weeks ago


Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions, causing
> ksmd to use large amount of cpu without deduplicating much pages.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
> 
> Consider the following test program which creates a 32 TiB mapping in
> the virtual address space but only populates a single page:
> 
> #include <unistd.h>
> #include <stdio.h>
> #include <sys/mman.h>
> 
> /* 32 TiB */
> const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
> 
> int main() {
>          char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
>                            MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
> 
>          if (area == MAP_FAILED) {
>                  perror("mmap() failed\n");
>                  return -1;
>          }
> 
>          /* Populate a single page such that we get an anon_vma. */
>          *area = 0;
> 
>          /* Enable KSM. */
>          madvise(area, size, MADV_MERGEABLE);
>          pause();
>          return 0;
> }
> 
> $ ./ksm-sparse  &
> $ echo 1 > /sys/kernel/mm/ksm/run
> 
> Without this patch ksmd uses 100% of the cpu for a long time (more then
> 1 hour in my test machine) scanning all the 32 TiB virtual address space
> that contain only one mapped page. This makes ksmd essentially deadlocked
> not able to deduplicate anything of value.
> With this patch ksmd walks only the one mapped page and skips the rest of
> the 32 TiB virtual address space, making the scan fast using little cpu.
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> ---
> v5:
>    - Improve patch description
> 
> v4: https://lore.kernel.org/linux-mm/20251022153059.22763-1-pedrodemargomes@gmail.com/
>    - Make minimal changes to replace folio_walk by walk_page_range_vma
> 
> v3: https://lore.kernel.org/all/20251016012236.4189-1-pedrodemargomes@gmail.com/
>    - Treat THPs in ksm_pmd_entry
>    - Update ksm_scan.address outside walk_page_range
>    - Change goto to while loop
> 
> v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
>    - Use pmd_entry to walk page range
>    - Use cond_resched inside pmd_entry()
>    - walk_page_range returns page+folio
> 
> v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/
> 
> Reported-by: craftfever <craftfever@airmail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Suggested-by: David Hildenbrand <david@redhat.com>
> Co-developed-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
>   mm/ksm.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
>   1 file changed, 104 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..4f672f4f2140 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
>   	return true;
>   }
>   
> +struct ksm_next_page_arg {
> +	struct folio *folio;
> +	struct page *page;
> +	unsigned long addr;
> +};
> +
> +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
> +		struct mm_walk *walk)
> +{
> +	struct ksm_next_page_arg *private = walk->private;
> +	struct vm_area_struct *vma = walk->vma;
> +	pte_t *start_ptep = NULL, *ptep, pte;
> +	struct mm_struct *mm = walk->mm;
> +	struct folio *folio;
> +	struct page *page;
> +	spinlock_t *ptl;
> +	pmd_t pmd;
> +
> +	if (ksm_test_exit(mm))
> +		return 0;
> +
> +	cond_resched();
> +
> +	pmd = pmdp_get_lockless(pmdp);
> +	if (!pmd_present(pmd))
> +		return 0;
> +
> +	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
> +		ptl = pmd_lock(mm, pmdp);
> +		pmd = pmdp_get(pmdp);
> +
> +		if (!pmd_present(pmd)) {
> +			goto not_found_unlock;
> +		} else if (pmd_leaf(pmd)) {
> +			page = vm_normal_page_pmd(vma, addr, pmd);
> +			if (!page)
> +				goto not_found_unlock;
> +			folio = page_folio(page);
> +
> +			if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +				goto not_found_unlock;
> +
> +			page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
> +			goto found_unlock;
> +		}
> +		spin_unlock(ptl);
> +	}
> +
> +	start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> +	if (!start_ptep)
> +		return 0;
> +
> +	for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
> +		pte = ptep_get(ptep);
> +
> +		if (!pte_present(pte))
> +			continue;
> +
> +		page = vm_normal_page(vma, addr, pte);
> +		if (!page)
> +			continue;
> +		folio = page_folio(page);
> +
> +		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +			continue;
> +		goto found_unlock;
> +	}
> +
> +not_found_unlock:
> +	spin_unlock(ptl);
> +	if (start_ptep)
> +		pte_unmap(start_ptep);
> +	return 0;
> +found_unlock:
> +	folio_get(folio);
> +	spin_unlock(ptl);
> +	if (start_ptep)
> +		pte_unmap(start_ptep);
> +	private->page = page;
> +	private->folio = folio;
> +	private->addr = addr;
> +	return 1;
> +}
> +
> +static struct mm_walk_ops ksm_next_page_ops = {
> +	.pmd_entry = ksm_next_page_pmd_entry,
> +	.walk_lock = PGWALK_RDLOCK,
> +};
> +
>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   {
>   	struct mm_struct *mm;
> @@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   			ksm_scan.address = vma->vm_end;
>   
>   		while (ksm_scan.address < vma->vm_end) {
> +			struct ksm_next_page_arg ksm_next_page_arg;
>   			struct page *tmp_page = NULL;
> -			struct folio_walk fw;
>   			struct folio *folio;
>   
>   			if (ksm_test_exit(mm))
>   				break;
>   
> -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> -			if (folio) {
> -				if (!folio_is_zone_device(folio) &&
> -				     folio_test_anon(folio)) {
> -					folio_get(folio);
> -					tmp_page = fw.page;
> -				}
> -				folio_walk_end(&fw, vma);
> +			int found;
> +
> +			found = walk_page_range_vma(vma, ksm_scan.address,
> +						    vma->vm_end,
> +						    &ksm_next_page_ops,
> +						    &ksm_next_page_arg);
> +
> +			if (found > 0) {
> +				folio = ksm_next_page_arg.folio;
> +				tmp_page = ksm_next_page_arg.page;
> +				ksm_scan.address = ksm_next_page_arg.addr;
> +			} else {
> +				VM_WARN_ON_ONCE(found < 0);
> +				ksm_scan.address = vma->vm_end - PAGE_SIZE;
>   			}
>   
>   			if (tmp_page) {

Thank you, it works magnificently, very pleasent. Very low CPU-consuming 
and effective, no crashes. Is this a final version?

Re: [PATCH v5] ksm: use range-walk function to jump over holes in scan_get_next_rmap_item

Posted by David Hildenbrand 3 months, 2 weeks ago

>>    				break;
>>    
>> -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
>> -			if (folio) {
>> -				if (!folio_is_zone_device(folio) &&
>> -				     folio_test_anon(folio)) {
>> -					folio_get(folio);
>> -					tmp_page = fw.page;
>> -				}
>> -				folio_walk_end(&fw, vma);
>> +			int found;
>> +
>> +			found = walk_page_range_vma(vma, ksm_scan.address,
>> +						    vma->vm_end,
>> +						    &ksm_next_page_ops,
>> +						    &ksm_next_page_arg);
>> +
>> +			if (found > 0) {
>> +				folio = ksm_next_page_arg.folio;
>> +				tmp_page = ksm_next_page_arg.page;
>> +				ksm_scan.address = ksm_next_page_arg.addr;
>> +			} else {
>> +				VM_WARN_ON_ONCE(found < 0);
>> +				ksm_scan.address = vma->vm_end - PAGE_SIZE;
>>    			}
>>    
>>    			if (tmp_page) {
> 
> Thank you, it works magnificently, very pleasent. Very low CPU-consuming
> and effective, no crashes. Is this a final version?

If there is no further feedback, yes.

-- 
Cheers

David / dhildenb