This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
break_ksm_pmd_entry() to use folios.

break_ksm() goes back to using walk_page_range_vma() instead of
folio_walk_start(), which will make it easier to later modify break_ksm()
to perform a proper range walk.

Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
---
 mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 4f672f4f2140..922d2936e206 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 	return atomic_read(&mm->mm_users) == 0;
 }
 
+static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
+			struct mm_walk *walk)
+{
+	struct folio *folio = NULL;
+	spinlock_t *ptl;
+	pte_t *pte;
+	pte_t ptent;
+	int ret;
+
+	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+	if (!pte)
+		return 0;
+	ptent = ptep_get(pte);
+	if (pte_present(ptent)) {
+		folio = vm_normal_folio(walk->vma, addr, ptent);
+	} else if (!pte_none(ptent)) {
+		swp_entry_t entry = pte_to_swp_entry(ptent);
+
+		/*
+		 * As KSM pages remain KSM pages until freed, no need to wait
+		 * here for migration to end.
+		 */
+		if (is_migration_entry(entry))
+			folio = pfn_swap_entry_folio(entry);
+	}
+	/* return 1 if the page is a normal ksm page or KSM-placed zero page */
+	ret = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(ptent);
+	pte_unmap_unlock(pte, ptl);
+	return ret;
+}
+
+static const struct mm_walk_ops break_ksm_ops = {
+	.pmd_entry = break_ksm_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+	.pmd_entry = break_ksm_pmd_entry,
+	.walk_lock = PGWALK_WRLOCK,
+};
+
 /*
  * We use break_ksm to break COW on a ksm page by triggering unsharing,
  * such that the ksm page will get replaced by an exclusive anonymous page.
@@ -623,26 +664,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
 static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
 {
 	vm_fault_t ret = 0;
-
-	if (lock_vma)
-		vma_start_write(vma);
+	const struct mm_walk_ops *ops = lock_vma ?
+		&break_ksm_lock_vma_ops : &break_ksm_ops;
 
 	do {
-		bool ksm_page = false;
-		struct folio_walk fw;
-		struct folio *folio;
+		int ksm_page;
 
 		cond_resched();
-		folio = folio_walk_start(&fw, vma, addr,
-					 FW_MIGRATION | FW_ZEROPAGE);
-		if (folio) {
-			/* Small folio implies FW_LEVEL_PTE. */
-			if (!folio_test_large(folio) &&
-			    (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
-				ksm_page = true;
-			folio_walk_end(&fw, vma);
-		}
-
+		ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
+		if (WARN_ON_ONCE(ksm_page < 0))
+			return ksm_page;
 		if (!ksm_page)
 			return 0;
 		ret = handle_mm_fault(vma, addr,
--
2.43.0

On 31.10.25 18:46, Pedro Demarchi Gomes wrote:
> This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
> function break_ksm_pmd_entry() to use folios.
>
> This reverts break_ksm() to use walk_page_range_vma() instead of
> folio_walk_start().
> This will make it easier to later modify break_ksm() to perform a proper
> range walk.
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> ---
> mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
> 1 file changed, 47 insertions(+), 16 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 4f672f4f2140..922d2936e206 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> return atomic_read(&mm->mm_users) == 0;
> }
>
> +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> + struct mm_walk *walk)
> +{
> + struct folio *folio = NULL;
> + spinlock_t *ptl;
> + pte_t *pte;
> + pte_t ptent;
> + int ret;
> +
> + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> + if (!pte)
> + return 0;
> + ptent = ptep_get(pte);
> + if (pte_present(ptent)) {
> + folio = vm_normal_folio(walk->vma, addr, ptent);
> + } else if (!pte_none(ptent)) {
> + swp_entry_t entry = pte_to_swp_entry(ptent);
> +
> + /*
> + * As KSM pages remain KSM pages until freed, no need to wait
> + * here for migration to end.
> + */
> + if (is_migration_entry(entry))
> + folio = pfn_swap_entry_folio(entry);
> + }
> + /* return 1 if the page is an normal ksm page or KSM-placed zero page */
> + ret = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(ptent);

Staring at this again, we should really call is_ksm_zero_pte() only if we
know the folio is present.

It's not super dangerous in the old code (because we would only look at
present and migration entries), but now you are making it possible to
call it on even more non-present ptes.

With that handled

Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>

--
Cheers

David

On Mon, Nov 03, 2025 at 06:00:08PM +0100, David Hildenbrand (Red Hat) wrote:
> On 31.10.25 18:46, Pedro Demarchi Gomes wrote:
> > This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
> > function break_ksm_pmd_entry() to use folios.
> >
> > This reverts break_ksm() to use walk_page_range_vma() instead of
> > folio_walk_start().
> > This will make it easier to later modify break_ksm() to perform a proper
> > range walk.
> >
> > Suggested-by: David Hildenbrand <david@redhat.com>
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
> > ---
> > mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
> > 1 file changed, 47 insertions(+), 16 deletions(-)
> >
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 4f672f4f2140..922d2936e206 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
> > return atomic_read(&mm->mm_users) == 0;
> > }
> > +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
> > + struct mm_walk *walk)
> > +{
> > + struct folio *folio = NULL;
> > + spinlock_t *ptl;
> > + pte_t *pte;
> > + pte_t ptent;
> > + int ret;
> > +
> > + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
> > + if (!pte)
> > + return 0;
> > + ptent = ptep_get(pte);
> > + if (pte_present(ptent)) {
> > + folio = vm_normal_folio(walk->vma, addr, ptent);
> > + } else if (!pte_none(ptent)) {
> > + swp_entry_t entry = pte_to_swp_entry(ptent);
> > +
> > + /*
> > + * As KSM pages remain KSM pages until freed, no need to wait
> > + * here for migration to end.
> > + */
> > + if (is_migration_entry(entry))
> > + folio = pfn_swap_entry_folio(entry);
> > + }
> > + /* return 1 if the page is an normal ksm page or KSM-placed zero page */
> > + ret = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(ptent);
>
> Staring again, we should really call is_ksm_zero_pte() only if we know the
> folio is present.
>
> It's not super dangerous in the old code (because we would only look at
> present an migration entries), but now you are making it possible to call it
> on even more non-present ptes.
>

IIUC vm_normal_folio() will return NULL in the case of a KSM zero pte, so
we cannot do

	found = folio && (folio_test_ksm(folio) || is_ksm_zero_pte(ptent))

because it will always be false for a KSM zero pte. So we should do

	found = (folio && folio_test_ksm(folio)) ||
		(pte_present(ptent) && is_ksm_zero_pte(ptent));

since, if the pte is present and is a zero pte, we can guarantee that the
page behind it is present.
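For reference, the tail of break_ksm_pmd_entry() would then look roughly
like this (just a sketch on top of this patch, untested):

	/* return 1 if the page is a normal ksm page or KSM-placed zero page */
	ret = (folio && folio_test_ksm(folio)) ||
	      (pte_present(ptent) && is_ksm_zero_pte(ptent));
	pte_unmap_unlock(pte, ptl);
	return ret;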
Sorry if I am missing something.
> --
> Cheers
>
> David
>

On 05.11.25 14:28, Pedro Demarchi Gomes wrote:
> On Mon, Nov 03, 2025 at 06:00:08PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 31.10.25 18:46, Pedro Demarchi Gomes wrote:
>>> This reverts commit e317a8d8b4f600fc7ec9725e26417030ee594f52 and changes
>>> function break_ksm_pmd_entry() to use folios.
>>>
>>> This reverts break_ksm() to use walk_page_range_vma() instead of
>>> folio_walk_start().
>>> This will make it easier to later modify break_ksm() to perform a proper
>>> range walk.
>>>
>>> Suggested-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com>
>>> ---
>>> mm/ksm.c | 63 ++++++++++++++++++++++++++++++++++++++++++--------------
>>> 1 file changed, 47 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/mm/ksm.c b/mm/ksm.c
>>> index 4f672f4f2140..922d2936e206 100644
>>> --- a/mm/ksm.c
>>> +++ b/mm/ksm.c
>>> @@ -607,6 +607,47 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
>>> return atomic_read(&mm->mm_users) == 0;
>>> }
>>> +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
>>> + struct mm_walk *walk)
>>> +{
>>> + struct folio *folio = NULL;
>>> + spinlock_t *ptl;
>>> + pte_t *pte;
>>> + pte_t ptent;
>>> + int ret;
>>> +
>>> + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
>>> + if (!pte)
>>> + return 0;
>>> + ptent = ptep_get(pte);
>>> + if (pte_present(ptent)) {
>>> + folio = vm_normal_folio(walk->vma, addr, ptent);
>>> + } else if (!pte_none(ptent)) {
>>> + swp_entry_t entry = pte_to_swp_entry(ptent);
>>> +
>>> + /*
>>> + * As KSM pages remain KSM pages until freed, no need to wait
>>> + * here for migration to end.
>>> + */
>>> + if (is_migration_entry(entry))
>>> + folio = pfn_swap_entry_folio(entry);
>>> + }
>>> + /* return 1 if the page is an normal ksm page or KSM-placed zero page */
>>> + ret = (folio && folio_test_ksm(folio)) || is_ksm_zero_pte(ptent);
>>
>> Staring again, we should really call is_ksm_zero_pte() only if we know the
>> folio is present.
>>
>> It's not super dangerous in the old code (because we would only look at
>> present an migration entries), but now you are making it possible to call it
>> on even more non-present ptes.
>>
>
> IIUC vm_normal_folio will return NULL in the case of a ksm zero pte, so
> we can not do
> found = folio && (folio_test_ksm(folio) || is_ksm_zero_pte(pte))
> because it will always be false for a ksm zero pte.
> So we should do
> found = (folio && folio_test_ksm(folio)) || (pte_present(ptent)
> && is_ksm_zero_pte(ptent));
> since if the pte is present and is a zero pte we can guarantee that
> the folio is present.

Yes, exactly.

--
Cheers

David