[RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()

Posted by Dev Jain 1 year, 1 month ago
In contrast to PMD collapse, we do not need to operate on two levels of the page
table simultaneously, so downgrade the mmap lock from write to read mode. Still
take the anon_vma lock in exclusive mode so as to not waste time in the rmap path,
which is going to fail anyway since the PTEs are about to be changed. Under the PTL,
copy the page contents, clear the PTEs, drop the folio pins, and (try to) unmap the
old folios. Then set the PTEs to the new folio using the set_ptes() API.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
Note: I have been trying hard to get rid of the locks in here: we are still
taking the PTL around the page copying; dropping the PTL and re-taking it after
the copying can lead to a deadlock, for example:
khugepaged						madvise(MADV_COLD)
folio_lock()						lock(ptl)
lock(ptl)						folio_lock()

We could create a locked folio list, drop both locks altogether, take the PTL,
do everything which __collapse_huge_page_isolate() does *except* the isolation, and
try locking the folios again, but that would reduce the efficiency of khugepaged
and almost look like a forced solution :)
Please see the following discussion if anyone is interested:
https://lore.kernel.org/all/66bb7496-a445-4ad7-8e56-4f2863465c54@arm.com/
(Apologies for not CCing the mailing list from the start)

 mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 87 insertions(+), 21 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88beebef773e..8040b130e677 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -714,24 +714,28 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 						struct vm_area_struct *vma,
 						unsigned long address,
 						spinlock_t *ptl,
-						struct list_head *compound_pagelist)
+						struct list_head *compound_pagelist, int order)
 {
 	struct folio *src, *tmp;
 	pte_t *_pte;
 	pte_t pteval;
 
-	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+	for (_pte = pte; _pte < pte + (1UL << order);
 	     _pte++, address += PAGE_SIZE) {
 		pteval = ptep_get(_pte);
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
 			if (is_zero_pfn(pte_pfn(pteval))) {
-				/*
-				 * ptl mostly unnecessary.
-				 */
-				spin_lock(ptl);
-				ptep_clear(vma->vm_mm, address, _pte);
-				spin_unlock(ptl);
+				if (order == HPAGE_PMD_ORDER) {
+					/*
+					 * ptl mostly unnecessary.
+					 */
+					spin_lock(ptl);
+					ptep_clear(vma->vm_mm, address, _pte);
+					spin_unlock(ptl);
+				} else {
+					ptep_clear(vma->vm_mm, address, _pte);
+				}
 				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
 			}
 		} else {
@@ -740,15 +744,20 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 			src = page_folio(src_page);
 			if (!folio_test_large(src))
 				release_pte_folio(src);
-			/*
-			 * ptl mostly unnecessary, but preempt has to
-			 * be disabled to update the per-cpu stats
-			 * inside folio_remove_rmap_pte().
-			 */
-			spin_lock(ptl);
-			ptep_clear(vma->vm_mm, address, _pte);
-			folio_remove_rmap_pte(src, src_page, vma);
-			spin_unlock(ptl);
+			if (order == HPAGE_PMD_ORDER) {
+				/*
+				 * ptl mostly unnecessary, but preempt has to
+				 * be disabled to update the per-cpu stats
+				 * inside folio_remove_rmap_pte().
+				 */
+				spin_lock(ptl);
+				ptep_clear(vma->vm_mm, address, _pte);
+				folio_remove_rmap_pte(src, src_page, vma);
+				spin_unlock(ptl);
+			} else {
+				ptep_clear(vma->vm_mm, address, _pte);
+				folio_remove_rmap_pte(src, src_page, vma);
+			}
 			free_page_and_swap_cache(src_page);
 		}
 	}
@@ -807,7 +816,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
 static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
 		pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
 		unsigned long address, spinlock_t *ptl,
-		struct list_head *compound_pagelist)
+		struct list_head *compound_pagelist, int order)
 {
 	unsigned int i;
 	int result = SCAN_SUCCEED;
@@ -815,7 +824,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
 	/*
 	 * Copying pages' contents is subject to memory poison at any iteration.
 	 */
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
+	for (i = 0; i < (1 << order); i++) {
 		pte_t pteval = ptep_get(pte + i);
 		struct page *page = folio_page(folio, i);
 		unsigned long src_addr = address + i * PAGE_SIZE;
@@ -834,7 +843,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
 
 	if (likely(result == SCAN_SUCCEED))
 		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
-						    compound_pagelist);
+						    compound_pagelist, order);
 	else
 		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
 						 compound_pagelist, order);
@@ -1196,7 +1205,7 @@ static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long addre
 
 	result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
 					   vma, address, pte_ptl,
-					   &compound_pagelist);
+					   &compound_pagelist, HPAGE_PMD_ORDER);
 	pte_unmap(pte);
 	if (unlikely(result != SCAN_SUCCEED))
 		goto out_up_write;
@@ -1228,6 +1237,61 @@ static int vma_collapse_anon_folio_pmd(struct mm_struct *mm, unsigned long addre
 	return result;
 }
 
+/* Enter with mmap read lock */
+static int vma_collapse_anon_folio(struct mm_struct *mm, unsigned long address,
+		struct vm_area_struct *vma, struct collapse_control *cc, pmd_t *pmd,
+		struct folio *folio, int order)
+{
+	int result;
+	struct mmu_notifier_range range;
+	spinlock_t *pte_ptl;
+	LIST_HEAD(compound_pagelist);
+	pte_t *pte;
+	pte_t entry;
+	int nr_pages = folio_nr_pages(folio);
+
+	anon_vma_lock_write(vma->anon_vma);
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
+				address + (PAGE_SIZE << order));
+	mmu_notifier_invalidate_range_start(&range);
+
+	pte = pte_offset_map_lock(mm, pmd, address, &pte_ptl);
+	if (pte)
+		result = __collapse_huge_page_isolate(vma, address, pte, cc,
+						      &compound_pagelist, order);
+	else
+		result = SCAN_PMD_NULL;
+
+	if (unlikely(result != SCAN_SUCCEED))
+		goto out_up_read;
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	__folio_mark_uptodate(folio);
+	entry = mk_pte(&folio->page, vma->vm_page_prot);
+	entry = maybe_mkwrite(entry, vma);
+
+	result = __collapse_huge_page_copy(pte, folio, pmd, *pmd,
+					   vma, address, pte_ptl,
+					   &compound_pagelist, order);
+	if (unlikely(result != SCAN_SUCCEED))
+		goto out_up_read;
+
+	folio_ref_add(folio, nr_pages - 1);
+	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
+	folio_add_lru_vma(folio, vma);
+	deferred_split_folio(folio, false);
+	set_ptes(mm, address, pte, entry, nr_pages);
+	update_mmu_cache_range(NULL, vma, address, pte, nr_pages);
+	pte_unmap_unlock(pte, pte_ptl);
+	mmu_notifier_invalidate_range_end(&range);
+	result = SCAN_SUCCEED;
+
+out_up_read:
+	mmap_read_unlock(mm);
+	return result;
+}
+
 static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 			      int referenced, int unmapped, int order,
 			      struct collapse_control *cc)
@@ -1276,6 +1340,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 
 	if (order == HPAGE_PMD_ORDER)
 		result = vma_collapse_anon_folio_pmd(mm, address, vma, cc, pmd, folio);
+	else
+		result = vma_collapse_anon_folio(mm, address, vma, cc, pmd, folio, order);
 
 	if (result == SCAN_SUCCEED)
 		folio = NULL;
-- 
2.30.2
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Usama Arif 1 year, 1 month ago

On 16/12/2024 16:51, Dev Jain wrote:
> [...]
> +	folio_ref_add(folio, nr_pages - 1);
> +	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
> +	folio_add_lru_vma(folio, vma);
> +	deferred_split_folio(folio, false);

Hi Dev,

You are adding the lower-order folios to the deferred split queue,
but you haven't changed the THP shrinker to take this into account.

Under memory pressure you will be doing a lot of work checking the contents of
all mTHP pages, which will be wasted unless you change the shrinker; something
like below (unbuilt, untested) might work:

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c89aed1510f1..f9586df40f67 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3788,7 +3788,7 @@ static bool thp_underused(struct folio *folio)
                kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
                if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
                        num_zero_pages++;
-                       if (num_zero_pages > khugepaged_max_ptes_none) {
+                       if (num_zero_pages > khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - folio_order(folio))) {
                                kunmap_local(kaddr);
                                return true;
                        }


The question is, do we want the shrinker to run for lower-order mTHPs? It can consume
a lot of CPU cycles and not be as useful as it is for PMD-order THPs. So instead of the
above, we could disable the THP shrinker for lower orders?
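
For that alternative, the gate could be as small as something like this (just a
sketch; the helper name and where it would be called from are assumptions, not
existing code):

/*
 * Hypothetical helper: keep the "underused THP" scan PMD-only so the
 * shrinker never touches the contents of lower-order (mTHP) folios.
 */
static inline bool thp_underused_applicable(struct folio *folio)
{
	return folio_order(folio) == HPAGE_PMD_ORDER;
}

thp_underused() (or its caller) would then simply bail out early when this
returns false.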

Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Dev Jain 1 year, 1 month ago
On 06/01/25 3:47 pm, Usama Arif wrote:
>
> On 16/12/2024 16:51, Dev Jain wrote:
>> [...]
>> +	folio_ref_add(folio, nr_pages - 1);
>> +	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
>> +	folio_add_lru_vma(folio, vma);
>> +	deferred_split_folio(folio, false);
> Hi Dev,
>
> You are adding the lower order folios to the deferred split queue,
> but you havent changed the THP shrinker to take this into account.

Thanks for the observation!

>
> At memory pressure you will be doing a lot of work checking the contents of
> all mTHP pages which will be wasted unless you change the shrinker, something
> like below (unbuilt, untested) might work:
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c89aed1510f1..f9586df40f67 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3788,7 +3788,7 @@ static bool thp_underused(struct folio *folio)
>                  kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
>                  if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
>                          num_zero_pages++;
> -                       if (num_zero_pages > khugepaged_max_ptes_none) {
> +                       if (num_zero_pages > khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - folio_order(folio))) {
>                                  kunmap_local(kaddr);
>                                  return true;
>                          }
>
>
> The question is, do we want the shrinker to be run for lower order mTHPs? It can consume
> a lot of CPU cycles and not be as useful as PMD order THPs. So instead of above, we could
> disable THP shrinker for lower orders?

Your suggestion makes sense to me. Freeing two order-(PMD - 1) mTHPs gives us the same amount
of memory as freeing one PMD THP, with the downside being one extra iteration, one extra freeing,
a longer deferred list, etc., basically more overhead.

Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by David Hildenbrand 1 year, 1 month ago
On 16.12.24 17:51, Dev Jain wrote:
> [...]
> @@ -740,15 +744,20 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>   			src = page_folio(src_page);
>   			if (!folio_test_large(src))
>   				release_pte_folio(src);
> -			/*
> -			 * ptl mostly unnecessary, but preempt has to
> -			 * be disabled to update the per-cpu stats
> -			 * inside folio_remove_rmap_pte().
> -			 */
> -			spin_lock(ptl);
> -			ptep_clear(vma->vm_mm, address, _pte);
> -			folio_remove_rmap_pte(src, src_page, vma);
> -			spin_unlock(ptl);
> +			if (order == HPAGE_PMD_ORDER) {
> +				/*
> +				* ptl mostly unnecessary, but preempt has to
> +				* be disabled to update the per-cpu stats
> +				* inside folio_remove_rmap_pte().
> +				*/
> +				spin_lock(ptl);
> +				ptep_clear(vma->vm_mm, address, _pte);




> +				folio_remove_rmap_pte(src, src_page, vma);
> +				spin_unlock(ptl);
> +			} else {
> +				ptep_clear(vma->vm_mm, address, _pte);
> +				folio_remove_rmap_pte(src, src_page, vma);
> +			}

As I've talked to Nico about this code recently ... :)

Are you clearing the PTE after the copy succeeded? If so, where is the 
TLB flush?

How do you sync against concurrent write access + GUP-fast?


The sequence really must be: (1) clear the PTE/PMD + flush the TLB, (2) check
whether there are unexpected page references (e.g., GUP) and if so back off,
(3) copy the page content, (4) set the updated PTE/PMD.
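
Something like this per-page helper is the shape I mean (purely illustrative;
the names and the expected_refs bookkeeping are assumptions, not existing
khugepaged code):

static int collapse_move_one_page(struct vm_area_struct *vma, unsigned long addr,
				  pte_t *ptep, struct page *src, struct page *dst,
				  int expected_refs)
{
	/* (1) Clear the PTE and flush the TLB so no CPU can keep writing. */
	pte_t orig_pte = ptep_clear_flush(vma, addr, ptep);

	/* (2) Back off if someone (e.g. GUP) still holds an unexpected reference. */
	if (folio_ref_count(page_folio(src)) != expected_refs) {
		set_pte_at(vma->vm_mm, addr, ptep, orig_pte);
		return -EAGAIN;
	}

	/* (3) Copy the now-stable content. */
	copy_user_highpage(dst, src, addr, vma);

	/* (4) The caller then installs the PTE(s) pointing at the new folio. */
	return 0;
}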

To Nico, I suggested doing it simple initially, and still clear the 
high-level PMD entry + flush under mmap write lock, then re-map the PTE 
table after modifying the page table. It's not as efficient, but "harder 
to get wrong".

Maybe that's already happening, but I stumbled over this clearing logic 
in __collapse_huge_page_copy_succeeded(), so I'm curious.

-- 
Cheers,

David / dhildenb
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Dev Jain 1 year, 1 month ago
On 16/12/24 10:36 pm, David Hildenbrand wrote:
> On 16.12.24 17:51, Dev Jain wrote:
>> [...]
>
> As I've talked to Nico about this code recently ... :)
>
> Are you clearing the PTE after the copy succeeded? If so, where is the 
> TLB flush?
>
> How do you sync against concurrent write acess + GUP-fast?
>
>
> The sequence really must be: (1) clear PTE/PMD + flush TLB (2) check 
> if there are unexpected page references (e.g., GUP) if so back off (3) 
> copy page content (4) set updated PTE/PMD.
>
> To Nico, I suggested doing it simple initially, and still clear the 
> high-level PMD entry + flush under mmap write lock, then re-map the 
> PTE table after modifying the page table. It's not as efficient, but 
> "harder to get wrong".

Btw, if we are taking the mmap write lock, then we do not require flushing
the PMD entry for mTHP collapse, right?
>
> Maybe that's already happening, but I stumbled over this clearing 
> logic in __collapse_huge_page_copy_succeeded(), so I'm curious.
>
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Dev Jain 1 year, 1 month ago
On 16/12/24 10:36 pm, David Hildenbrand wrote:
> On 16.12.24 17:51, Dev Jain wrote:
>> [...]
>
> As I've talked to Nico about this code recently ... :)
>
> Are you clearing the PTE after the copy succeeded? If so, where is the 
> TLB flush?
>
> How do you sync against concurrent write acess + GUP-fast?
>
>
> The sequence really must be: (1) clear PTE/PMD + flush TLB (2) check 
> if there are unexpected page references (e.g., GUP) if so back off (3) 
> copy page content (4) set updated PTE/PMD.

Thanks... we need to ensure GUP-fast does not write while we are copying the
contents, so (2) will ensure that GUP-fast sees the cleared PTE and backs off.
>
> To Nico, I suggested doing it simple initially, and still clear the 
> high-level PMD entry + flush under mmap write lock, then re-map the 
> PTE table after modifying the page table. It's not as efficient, but 
> "harder to get wrong".
>
> Maybe that's already happening, but I stumbled over this clearing 
> logic in __collapse_huge_page_copy_succeeded(), so I'm curious.

No, I am not even touching the PMD. I guess the sequence you described 
should work? I just need to reverse the copying and PTE clearing order 
to implement this sequence.
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by David Hildenbrand 1 year, 1 month ago
On 17.12.24 11:07, Dev Jain wrote:
> 
> On 16/12/24 10:36 pm, David Hildenbrand wrote:
>> On 16.12.24 17:51, Dev Jain wrote:
>>> [...]
>>
>> As I've talked to Nico about this code recently ... :)
>>
>> Are you clearing the PTE after the copy succeeded? If so, where is the
>> TLB flush?
>>
>> How do you sync against concurrent write acess + GUP-fast?
>>
>>
>> The sequence really must be: (1) clear PTE/PMD + flush TLB (2) check
>> if there are unexpected page references (e.g., GUP) if so back off (3)
>> copy page content (4) set updated PTE/PMD.
> 
> Thanks...we need to ensure GUP-fast does not write when we are copying
> contents, so (2) will ensure that GUP-fast will see the cleared PTE and
> back-off.

Yes, and of course you also have to make sure that the CPU cannot still
concurrently modify the page content while/after you copy it, but before
you unmap+flush.

>>
>> To Nico, I suggested doing it simple initially, and still clear the
>> high-level PMD entry + flush under mmap write lock, then re-map the
>> PTE table after modifying the page table. It's not as efficient, but
>> "harder to get wrong".
>>
>> Maybe that's already happening, but I stumbled over this clearing
>> logic in __collapse_huge_page_copy_succeeded(), so I'm curious.
> 
> No, I am not even touching the PMD. I guess the sequence you described
> should work? I just need to reverse the copying and PTE clearing order
> to implement this sequence.

That would work, but you really have to hold the PTL for the whole
period: from when you temporarily clear the PTEs + flush the TLB, through
the copy, until you re-insert the updated ones.

When having to back off (restore the original PTEs), or for copying, you'll
likely need access to the original PTEs, which were already cleared. So
you'll likely need a temporary copy of the original PTEs somehow.

That's why temporarily clearing the PMD under the mmap write lock is easier to
implement, at the cost of requiring the mmap lock in write mode like PMD
collapse.
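
E.g., something along these lines (sketch only, assuming the PTL is held across
the whole clear/copy/re-insert window; all names are made up):

static void save_and_clear_ptes(struct vm_area_struct *vma, unsigned long start,
				pte_t *ptep, pte_t *saved, int nr)
{
	unsigned long addr = start;
	int i;

	for (i = 0; i < nr; i++, addr += PAGE_SIZE)
		saved[i] = ptep_get_and_clear(vma->vm_mm, addr, ptep + i);
	flush_tlb_range(vma, start, start + nr * PAGE_SIZE);
}

static void restore_ptes(struct vm_area_struct *vma, unsigned long start,
			 pte_t *ptep, pte_t *saved, int nr)
{
	unsigned long addr = start;
	int i;

	/* Back-off path: re-install the original PTEs unchanged. */
	for (i = 0; i < nr; i++, addr += PAGE_SIZE)
		set_pte_at(vma->vm_mm, addr, ptep + i, saved[i]);
}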

-- 
Cheers,

David / dhildenb

Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Dev Jain 1 year, 1 month ago
On 17/12/24 4:02 pm, David Hildenbrand wrote:
> On 17.12.24 11:07, Dev Jain wrote:
>>
>> On 16/12/24 10:36 pm, David Hildenbrand wrote:
>>> On 16.12.24 17:51, Dev Jain wrote:
>>>> [...]
>>>
>>> As I've talked to Nico about this code recently ... :)
>>>
>>> Are you clearing the PTE after the copy succeeded? If so, where is the
>>> TLB flush?
>>>
>>> How do you sync against concurrent write acess + GUP-fast?
>>>
>>>
>>> The sequence really must be: (1) clear PTE/PMD + flush TLB (2) check
>>> if there are unexpected page references (e.g., GUP) if so back off (3)
>>> copy page content (4) set updated PTE/PMD.
>>
>> Thanks...we need to ensure GUP-fast does not write when we are copying
>> contents, so (2) will ensure that GUP-fast will see the cleared PTE and
>> back-off.
>
> Yes, and of course, that also the CPU cannot concurrently still modify 
> the page content while/after you copy the page content, but before you 
> unmap+flush.
>
>>>
>>> To Nico, I suggested doing it simple initially, and still clear the
>>> high-level PMD entry + flush under mmap write lock, then re-map the
>>> PTE table after modifying the page table. It's not as efficient, but
>>> "harder to get wrong".
>>>
>>> Maybe that's already happening, but I stumbled over this clearing
>>> logic in __collapse_huge_page_copy_succeeded(), so I'm curious.
>>
>> No, I am not even touching the PMD. I guess the sequence you described
>> should work? I just need to reverse the copying and PTE clearing order
>> to implement this sequence.
>
> That would work, but you really have to hold the PTL for the whole 
> period: from when you temporarily clear PTEs +_ flush the TLB, when 
> you copy, until you re-insert the updated ones.

Ignoring the implementation and code churn part :) Is the following
algorithm theoretically correct: (1) Take the PTL, scan the PTEs, isolate
and lock the folios, set the PTEs to migration entries, and check the
folio references. This solves the concurrent-write races. Now we can drop
the PTL: no one can write to the old folios because (a) rmap cannot run
and (b) the folio cannot be derived from the PTE. Note that the
migration_entry_wait_on_locked() path can be scheduled out, so this is
not the same as the fault handlers spinning on the PTL. We can now safely
copy the old folios into the new folio, then take the PTL: the PTL is
available because every pagetable walker will see a migration entry and
back off. We batch-set the PTEs now and release the folio locks, letting
the fault handlers get out of migration_entry_wait_on_locked(). Compared
to the old code, the failure points we need to handle are a copy failure
or a folio-isolation failure partway through... therefore, we need to
maintain a list of the old PTEs corresponding to the PTEs that were set
to migration entries.
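
Very roughly, and only to make the idea concrete, something like the
following (not compile-tested; the helper name is made up, and refcount
re-checks, mmu notifiers, swap/none PTEs and most error handling are
elided):

static int collapse_via_migration_entries(struct vm_area_struct *vma,
		unsigned long haddr, pmd_t *pmd, struct folio *new_folio,
		int nr_pages)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t old_ptes[HPAGE_PMD_NR];	/* real code: kmalloc(), too big for the stack */
	spinlock_t *ptl;
	pte_t *pte;
	int i;

	pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
	if (!pte)
		return -EAGAIN;

	for (i = 0; i < nr_pages; i++) {
		unsigned long addr = haddr + i * PAGE_SIZE;
		pte_t pteval = ptep_get(pte + i);
		struct page *page = vm_normal_page(vma, addr, pteval);
		swp_entry_t entry;

		if (!page || !folio_trylock(page_folio(page)))
			goto abort;

		old_ptes[i] = pteval;
		entry = make_readable_migration_entry(page_to_pfn(page));
		set_pte_at(mm, addr, pte + i, swp_entry_to_pte(entry));
	}
	spin_unlock(ptl);
	flush_tlb_range(vma, haddr, haddr + nr_pages * PAGE_SIZE);

	/*
	 * PTL dropped: faults sleep on the migration entries, rmap is held
	 * off by the folio locks, GUP-fast backs off on non-present PTEs.
	 * (A folio reference re-check would go here before copying.)
	 */
	for (i = 0; i < nr_pages; i++)
		copy_user_highpage(folio_page(new_folio, i),
				   vm_normal_page(vma, haddr + i * PAGE_SIZE,
						  old_ptes[i]),
				   haddr + i * PAGE_SIZE, vma);

	spin_lock(ptl);
	/* real code: rmap for new_folio, mm counters, dirty/write bits */
	set_ptes(mm, haddr, pte,
		 mk_pte(folio_page(new_folio, 0), vma->vm_page_prot), nr_pages);
	spin_unlock(ptl);
	pte_unmap(pte);
	/* unlock + release the old folios here */
	return 0;
abort:
	/* restore old_ptes[] for the entries already converted, unlock folios */
	spin_unlock(ptl);
	pte_unmap(pte);
	return -EAGAIN;
}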

Note that I had suggested this "setting the PTEs to a global invalid
state" thingy in our previous discussion too, but I guess simultaneously
working on the PMD and the PTEs was the main problem there, since the
walkers do not take a lock on the PMD to check whether someone is
changing it, when what they are really interested in is making changes at
the PTE level. In fact, leaving all specifics like racing with a
particular pagetable walker aside, I do not see why the following claim
isn't true:

Claim: The (anon-private) mTHP khugepaged collapse problem is 
mathematically equivalent to the (anon-private) page migration problem.

The difference being that in khugepaged we need the VMA to be stable,
hence we have to take the mmap_read_lock(), and we have to "migrate" to
one large folio instead of to individual pages.

If my theory is correct at all, I'll leave it to the community to decide
whether it's worth going through my brain-rot :)


>
> When having to back-off (restore original PTEs), or for copying, 
> you'll likely need access to the original PTEs, which were already 
> cleared. So likely you need a temporary copy of the original PTEs 
> somehow.
>
> That's why temporarily clearing the PMD under the mmap write lock is easier 
> to implement, at the cost of requiring the mmap lock in write mode 
> like PMD collapse.
>
>
So, I understand the following: a CPU spinning on the PTL for a long
time is worse than taking the mmap_write_lock(). The latter blocks this
process from doing mmap()s, which, to my limited knowledge, is bad for
memory-intensive processes (in line with the fact that the maple tree was
introduced to optimize VMA operations), while the former effectively
removes one unit of computation from the system for a long time.
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by David Hildenbrand 1 year, 1 month ago
Still on PTO, but replying to this mail :)

>>>>
>>>> To Nico, I suggested doing it simple initially, and still clear the
>>>> high-level PMD entry + flush under mmap write lock, then re-map the
>>>> PTE table after modifying the page table. It's not as efficient, but
>>>> "harder to get wrong".
>>>>
>>>> Maybe that's already happening, but I stumbled over this clearing
>>>> logic in __collapse_huge_page_copy_succeeded(), so I'm curious.
>>>
>>> No, I am not even touching the PMD. I guess the sequence you described
>>> should work? I just need to reverse the copying and PTE clearing order
>>> to implement this sequence.
>>
>> That would work, but you really have to hold the PTL for the whole
>> period: from when you temporarily clear PTEs + flush the TLB, while
>> you copy, until you re-insert the updated ones.
> 
> Ignoring the implementation and code churn part :) Is the following
> algorithm theoretically correct: (1) Take PTL, scan PTEs,
> isolate and lock the folios, set the PTEs to migration entries, check
> folio references. This will solve concurrent write
> access races.
> Now, we can drop the PTL...no one can write to the old folios
> because (1) rmap cannot run (2) folio from PTE
> cannot be derived. Note that migration_entry_wait_on_locked() path can
> be scheduled out, so this is not the same as the
> fault handlers spinning on the PTL. We can now safely copy old folios to
> new folio, then take the PTL: The PTL is
> available because every pagetable walker will see a migration entry and
> back off. We batch set the PTEs now, and release
> the folio locks, making the fault handlers getting out of
> migration_entry_wait_on_locked(). As compared to the old code,
> the point of failure we need to handle is when copying fails, or at some
> point folio isolation fails...therefore, we need to
> maintain a list of old PTEs corresponding to the PTEs set to migration
> entries.
> 
> Note that, I had suggested this "setting the PTEs to a global invalid
> state" thingy in our previous discussion too, but I guess
> simultaneously working on the PMD and PTE was the main problem there,
> since the walkers do not take a lock on the PMD
> to check if someone is changing it, when what they really are interested
> in is to make change at the PTE level. In fact, leaving
> all specifics like racing with a specific pagetable walker etc aside, I
> do not see why the following claim isn't true:
> 
> Claim: The (anon-private) mTHP khugepaged collapse problem is
> mathematically equivalent to the (anon-private) page migration problem.
> 
> The difference being, in khugepaged we need the VMA to be stable, hence
> have to take the mmap_read_lock(), and have to "migrate"
> to a large folio instead of individual pages.
> 
> If at all my theory is correct, I'll leave it to the community to decide
> if it's worth it to go through my brain-rot :)

What we have to achieve is

a) Make sure GUP-fast cannot grab the folio
b) The CPU cannot read/write the folio
c) No "ordinary" page table walkers can grab the folio.

Handling a) and b) involves either invalidating the affected entry
(including via a migration entry) or temporarily clearing it (what we do
right now for the PMD), plus flushing the TLB.

We can use migration entries while the folio is locked; there might be
some devil in the details, so I would suggest going with something
simpler first, and then trying to make use of migration entries.
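
To spell out the required ordering for a single PTE (illustrative only;
the helper name is made up, the PTL is assumed to be held across the
whole sequence, and the expected reference count is assumed to have been
computed up front):

static int copy_one_page_safely(struct vm_area_struct *vma, unsigned long addr,
				pte_t *ptep, struct page *dst_page,
				int expected_refs)
{
	struct mm_struct *mm = vma->vm_mm;
	/* (1) make the old page unreachable: clear the PTE + flush the TLB */
	pte_t orig = ptep_clear_flush(vma, addr, ptep);
	struct page *src_page = vm_normal_page(vma, addr, orig);
	struct folio *src = page_folio(src_page);

	/*
	 * (2) only now is the reference check meaningful: GUP-fast either
	 * already holds a reference (visible here) or will observe the
	 * cleared PTE and back off.
	 */
	if (folio_ref_count(src) != expected_refs) {
		set_pte_at(mm, addr, ptep, orig);	/* back off */
		return -EAGAIN;
	}

	/* (3) nothing can write through the old mapping any more: copy */
	copy_user_highpage(dst_page, src_page, addr, vma);

	/*
	 * (4) the caller installs the PTE(s) pointing at the new folio,
	 * e.g. via set_ptes(), once the whole range has been copied.
	 */
	return 0;
}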


> 
> 
>>
>> When having to back-off (restore original PTEs), or for copying,
>> you'll likely need access to the original PTEs, which were already
>> cleared. So likely you need a temporary copy of the original PTEs
>> somehow.
>>
>> That's why temporarily clearing the PMD under the mmap write lock is easier 
>> to implement, at the cost of requiring the mmap lock in write mode
>> like PMD collapse.
>>
>>
> So, I understand the following: Some CPU spinning on the PTL for a long
> time is worse than taking the mmap_write_lock(). The latter blocks this
> process
> from doing mmap()s, which, in my limited knowledge, is bad for
> memory-intensive processes (aligned with the fact that the maple tree was
> introduced to optimize VMA operations), and the former literally nukes
> one unit of computation from the system for a long time.

With per-VMA locks, khugepaged grabbing the mmap lock in write mode got 
"less" bad, because most page faults can still make progress. But it's 
certainly still suboptimal.

Yes, I think having a lot of threads spinning on a PTL for a long time 
can be worse than using a sleepable lock in some scenarios; especially 
if the PTL spans more than a single page table.

-- 
Cheers,

David / dhildenb
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Dev Jain 1 year, 1 month ago
On 18/12/24 2:05 pm, Dev Jain wrote:
>
> On 17/12/24 4:02 pm, David Hildenbrand wrote:
>> On 17.12.24 11:07, Dev Jain wrote:
>>>
>>> On 16/12/24 10:36 pm, David Hildenbrand wrote:
>>>> On 16.12.24 17:51, Dev Jain wrote:
>>>>> In contrast to PMD-collapse, we do not need to operate on two levels
>>>>> of pagetable
>>>>> simultaneously. Therefore, downgrade the mmap lock from write to read
>>>>> mode. Still
>>>>> take the anon_vma lock in exclusive mode so as to not waste time in
>>>>> the rmap path,
>>>>> which is anyways going to fail since the PTEs are going to be
>>>>> changed. Under the PTL,
>>>>> copy page contents, clear the PTEs, remove folio pins, and (try to)
>>>>> unmap the
>>>>> old folios. Set the PTEs to the new folio using the set_ptes() API.
>>>>>
>>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>>> ---
>>>>> Note: I have been trying hard to get rid of the locks in here: we
>>>>> still are
>>>>> taking the PTL around the page copying; dropping the PTL and taking
>>>>> it after
>>>>> the copying should lead to a deadlock, for example:
>>>>> khugepaged                        madvise(MADV_COLD)
>>>>> folio_lock()                        lock(ptl)
>>>>> lock(ptl)                        folio_lock()
>>>>>
>>>>> We can create a locked folio list, altogether drop both the locks,
>>>>> take the PTL,
>>>>> do everything which __collapse_huge_page_isolate() does *except* the
>>>>> isolation and
>>>>> again try locking folios, but then it will reduce efficiency of
>>>>> khugepaged
>>>>> and almost looks like a forced solution :)
>>>>> Please note the following discussion if anyone is interested:
>>>>> https://lore.kernel.org/all/66bb7496-a445-4ad7-8e56-4f2863465c54@arm.com/ 
>>>>>
>>>>>
>>>>> (Apologies for not CCing the mailing list from the start)
>>>>>
>>>>>    mm/khugepaged.c | 108 
>>>>> ++++++++++++++++++++++++++++++++++++++----------
>>>>>    1 file changed, 87 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>>> index 88beebef773e..8040b130e677 100644
>>>>> --- a/mm/khugepaged.c
>>>>> +++ b/mm/khugepaged.c
>>>>> @@ -714,24 +714,28 @@ static void
>>>>> __collapse_huge_page_copy_succeeded(pte_t *pte,
>>>>>                            struct vm_area_struct *vma,
>>>>>                            unsigned long address,
>>>>>                            spinlock_t *ptl,
>>>>> -                        struct list_head *compound_pagelist)
>>>>> +                        struct list_head *compound_pagelist, int 
>>>>> order)
>>>>>    {
>>>>>        struct folio *src, *tmp;
>>>>>        pte_t *_pte;
>>>>>        pte_t pteval;
>>>>>    -    for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>>>>> +    for (_pte = pte; _pte < pte + (1UL << order);
>>>>>             _pte++, address += PAGE_SIZE) {
>>>>>            pteval = ptep_get(_pte);
>>>>>            if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
>>>>>                add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
>>>>>                if (is_zero_pfn(pte_pfn(pteval))) {
>>>>> -                /*
>>>>> -                 * ptl mostly unnecessary.
>>>>> -                 */
>>>>> -                spin_lock(ptl);
>>>>> -                ptep_clear(vma->vm_mm, address, _pte);
>>>>> -                spin_unlock(ptl);
>>>>> +                if (order == HPAGE_PMD_ORDER) {
>>>>> +                    /*
>>>>> +                    * ptl mostly unnecessary.
>>>>> +                    */
>>>>> +                    spin_lock(ptl);
>>>>> +                    ptep_clear(vma->vm_mm, address, _pte);
>>>>> +                    spin_unlock(ptl);
>>>>> +                } else {
>>>>> +                    ptep_clear(vma->vm_mm, address, _pte);
>>>>> +                }
>>>>>                ksm_might_unmap_zero_page(vma->vm_mm, pteval);
>>>>>                }
>>>>>            } else {
>>>>> @@ -740,15 +744,20 @@ static void
>>>>> __collapse_huge_page_copy_succeeded(pte_t *pte,
>>>>>                src = page_folio(src_page);
>>>>>                if (!folio_test_large(src))
>>>>>                    release_pte_folio(src);
>>>>> -            /*
>>>>> -             * ptl mostly unnecessary, but preempt has to
>>>>> -             * be disabled to update the per-cpu stats
>>>>> -             * inside folio_remove_rmap_pte().
>>>>> -             */
>>>>> -            spin_lock(ptl);
>>>>> -            ptep_clear(vma->vm_mm, address, _pte);
>>>>> -            folio_remove_rmap_pte(src, src_page, vma);
>>>>> -            spin_unlock(ptl);
>>>>> +            if (order == HPAGE_PMD_ORDER) {
>>>>> +                /*
>>>>> +                * ptl mostly unnecessary, but preempt has to
>>>>> +                * be disabled to update the per-cpu stats
>>>>> +                * inside folio_remove_rmap_pte().
>>>>> +                */
>>>>> +                spin_lock(ptl);
>>>>> +                ptep_clear(vma->vm_mm, address, _pte);
>>>>
>>>>
>>>>
>>>>
>>>>> +                folio_remove_rmap_pte(src, src_page, vma);
>>>>> +                spin_unlock(ptl);
>>>>> +            } else {
>>>>> +                ptep_clear(vma->vm_mm, address, _pte);
>>>>> +                folio_remove_rmap_pte(src, src_page, vma);
>>>>> +            }
>>>>
>>>> As I've talked to Nico about this code recently ... :)
>>>>
>>>> Are you clearing the PTE after the copy succeeded? If so, where is the
>>>> TLB flush?
>>>>
>>>> How do you sync against concurrent write access + GUP-fast?
>>>>
>>>>
>>>> The sequence really must be: (1) clear PTE/PMD + flush TLB (2) check
>>>> if there are unexpected page references (e.g., GUP) if so back off (3)
>>>> copy page content (4) set updated PTE/PMD.
>>>
>>> Thanks... we need to ensure GUP-fast does not write while we are copying
>>> the contents, so (2) ensures that GUP-fast will see the cleared PTE and
>>> back off.
>>
>> Yes, and of course also that the CPU cannot concurrently modify the 
>> page content while/after you copy it, but before you unmap+flush.
>>
>>>>
>>>> To Nico, I suggested doing it simple initially, and still clear the
>>>> high-level PMD entry + flush under mmap write lock, then re-map the
>>>> PTE table after modifying the page table. It's not as efficient, but
>>>> "harder to get wrong".
>>>>
>>>> Maybe that's already happening, but I stumbled over this clearing
>>>> logic in __collapse_huge_page_copy_succeeded(), so I'm curious.
>>>
>>> No, I am not even touching the PMD. I guess the sequence you described
>>> should work? I just need to reverse the copying and PTE clearing order
>>> to implement this sequence.
>>
>> That would work, but you really have to hold the PTL for the whole 
>> period: from when you temporarily clear PTEs + flush the TLB, while 
>> you copy, until you re-insert the updated ones.
>
> Ignoring the implementation and code churn part :) Is the following 
> algorithm theoretically correct: (1) Take PTL, scan PTEs,
> isolate and lock the folios, set the PTEs to migration entries, check 
> folio references. This will solve concurrent write
> access races. Now, we can drop the PTL...no one can write to the old 
> folios because (1) rmap cannot run (2) folio from PTE
> cannot be derived. Note that migration_entry_wait_on_locked() path can 
> be scheduled out, so this is not the same as the
> fault handlers spinning on the PTL. We can now safely copy old folios 
> to new folio, then take the PTL: The PTL is
> available because every pagetable walker will see a migration entry 
> and back off. We batch set the PTEs now, and release
> the folio locks, making the fault handlers getting out of 
> migration_entry_wait_on_locked(). As compared to the old code,
> the point of failure we need to handle is when copying fails, or at 
> some point folio isolation fails...therefore, we need to
> maintain a list of old PTEs corresponding to the PTEs set to migration 
> entries.
>
> Note that, I had suggested this "setting the PTEs to a global invalid 
> state" thingy in our previous discussion too, but I guess
> simultaneously working on the PMD and PTE was the main problem there, 
> since the walkers do not take a lock on the PMD
> to check if someone is changing it, when what they really are 
> interested in is to make change at the PTE level. In fact, leaving
> all specifics like racing with a specific pagetable walker etc aside, 
> I do not see why the following claim isn't true:
>
> Claim: The (anon-private) mTHP khugepaged collapse problem is 
> mathematically equivalent to the (anon-private) page migration problem.
>
> The difference being, in khugepaged we need the VMA to be stable, 
> hence have to take the mmap_read_lock(), and have to "migrate"
> to a large folio instead of individual pages.
>
> If at all my theory is correct, I'll leave it to the community to 
> decide if it's worth it to go through my brain-rot :)

If more thought is required on this sequence, I can postpone it to a 
future optimization patch.

>
>
>>
>> When having to back-off (restore original PTEs), or for copying, 
>> you'll likely need access to the original PTEs, which were already 
>> cleared. So likely you need a temporary copy of the original PTEs 
>> somehow.
>>
>> That's why temporarily clearing the PMD under the mmap write lock is easier 
>> to implement, at the cost of requiring the mmap lock in write mode 
>> like PMD collapse.

Why do I need to clear the PMD if I am taking the mmap_write_lock() and 
operating only on the PTE?
>>
>>
> So, I understand the following: Some CPU spinning on the PTL for a 
> long time is worse than taking the mmap_write_lock(). The latter 
> blocks this process
> from doing mmap()s, which, in my limited knowledge, is bad for 
> memory-intensive processes (aligned with the fact that the maple tree was
> introduced to optimize VMA operations), and the former literally nukes 
> one unit of computation from the system for a long time.
>
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by David Hildenbrand 1 year, 1 month ago
>>>
>>> When having to back-off (restore original PTEs), or for copying,
>>> you'll likely need access to the original PTEs, which were already
>>> cleared. So likely you need a temporary copy of the original PTEs
>>> somehow.
>>>
>>> That's why temporarily clearing the PMD under the mmap write lock is easier
>>> to implement, at the cost of requiring the mmap lock in write mode
>>> like PMD collapse.
> 
> Why do I need to clear the PMD if I am taking the mmap_write_lock() and
> operating only on the PTE?

One approach I proposed to Nico (and I think he has a prototype) is:

a) Take all locks like we do today (mmap in write, vma in write, rmap in 
write)

After this step, no "ordinary" page table walkers can run anymore

b) Clear the PMD entry and flush the TLB like we do today

After this step, the CPU can no longer read/write the folios and 
GUP-fast cannot run. The PTE table is completely isolated.

c) Now we can work on the (temporarily cleared) PTE table as we please: 
isolate folios, lock them, ... without clearing the PTE entries, just 
like we do today.

d) Allocate the new folios (we don't have to hold any spinlocks), copy + 
replace the affected PTE entries in the isolated PTE table. Similar to 
what we do today, except that we don't clear PTEs but instead clear+reset.

e) Unlock+un-isolate + unref the collapsed folios like we do today.

f) Re-map the PTE-table, like we do today when collapse would have failed.


Of course, after taking all locks we have to re-verify that there is 
something to collapse (e.g., in d) we also have to check for unexpected 
folio references). The backup path is easy: remap the PTE table as no 
PTE entries were touched just yet.

Observe that many things are "like we do today".
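
Putting the steps together, a rough sketch (not compile-tested; the
helper name is made up and almost all of the real work in c)/d)/e) is
elided):

static int collapse_on_isolated_pte_table(struct mm_struct *mm,
		struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd)
{
	spinlock_t *pml;
	pmd_t orig_pmd;
	pte_t *pte;
	int result = 0;

	/* a) mmap/vma/anon_vma locks are assumed to be held by the caller */

	/* b) isolate the PTE table: clear the PMD entry + flush the TLB */
	pml = pmd_lock(mm, pmd);
	orig_pmd = pmdp_collapse_flush(vma, haddr, pmd);
	spin_unlock(pml);
	tlb_remove_table_sync_one();	/* wait out concurrent GUP-fast */

	/*
	 * c) + d) the table is unreachable now: isolate and lock the folios,
	 * re-check references, copy into the new folio and rewrite the
	 * affected PTEs in place -- no per-PTE clearing needed.
	 */
	pte = pte_offset_map(&orig_pmd, haddr);
	/* ... the actual isolate/copy/replace work, or set result on failure ... */
	pte_unmap(pte);

	/* e) unlock/putback/unref the old folios (not shown) */

	/* f) re-install the PTE table; on failure this is the entire undo */
	pml = pmd_lock(mm, pmd);
	pmd_populate(mm, pmd, pmd_pgtable(orig_pmd));
	spin_unlock(pml);
	return result;
}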


As soon as we go to read locks + PTE locks, it all gets more complicated 
to get it right. Not that it cannot be done, but the above is IMHO a lot 
simpler to get right.

-- 
Cheers,

David / dhildenb
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Dev Jain 1 year, 1 month ago
On 02/01/25 5:03 pm, David Hildenbrand wrote:
>>>>
>>>> When having to back-off (restore original PTEs), or for copying,
>>>> you'll likely need access to the original PTEs, which were already
>>>> cleared. So likely you need a temporary copy of the original PTEs
>>>> somehow.
>>>>
>>>> That's why temporarily clearing the PMD under the mmap write lock is easier
>>>> to implement, at the cost of requiring the mmap lock in write mode
>>>> like PMD collapse.
>>
>> Why do I need to clear the PMD if I am taking the mmap_write_lock() and
>> operating only on the PTE?
>
> One approach I proposed to Nico (and I think he has a prototype) is:
>
> a) Take all locks like we do today (mmap in write, vma in write, rmap 
> in write)
>
> After this step, no "ordinary" page table walkers can run anymore
>
> b) Clear the PMD entry and flush the TLB like we do today
>
> After this step, the CPU can no longer read/write the folios and 
> GUP-fast cannot run. The PTE table is completely isolated.
>
> c) Now we can work on the (temporarily cleared) PTE table as we 
> please: isolate folios, lock them, ... without clearing the PTE 
> entries, just like we do today.
>
> d) Allocate the new folios (we don't have to hold any spinlocks), copy 
> + replace the affected PTE entries in the isolated PTE table. Similar 
> to what we do today, except that we don't clear PTEs but instead 
> clear+reset.
>
> e) Unlock+un-isolate + unref the collapsed folios like we do today.
>
> f) Re-map the PTE-table, like we do today when collapse would have 
> failed.
>
>
> Of course, after taking all locks we have to re-verify that there is 
> something to collapse (e.g., in d) we also have to check for 
> unexpected folio references). The backup path is easy: remap the PTE 
> table as no PTE entries were touched just yet.
>
> Observe that many things are "like we do today".
>
>
> As soon as we go to read locks + PTE locks, it all gets more 
> complicated to get it right. Not that it cannot be done, but the above 
> is IMHO a lot simpler to get right.

Thanks for the reply. I'll go ahead with the write lock algorithm then.
Re: [RFC PATCH 09/12] khugepaged: Introduce vma_collapse_anon_folio()
Posted by Yang Shi 1 year, 1 month ago
On Mon, Dec 16, 2024 at 9:09 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 16.12.24 17:51, Dev Jain wrote:
> > In contrast to PMD-collapse, we do not need to operate on two levels of pagetable
> > simultaneously. Therefore, downgrade the mmap lock from write to read mode. Still
> > take the anon_vma lock in exclusive mode so as to not waste time in the rmap path,
> > which is anyways going to fail since the PTEs are going to be changed. Under the PTL,
> > copy page contents, clear the PTEs, remove folio pins, and (try to) unmap the
> > old folios. Set the PTEs to the new folio using the set_ptes() API.
> >
> > Signed-off-by: Dev Jain <dev.jain@arm.com>
> > ---
> > Note: I have been trying hard to get rid of the locks in here: we still are
> > taking the PTL around the page copying; dropping the PTL and taking it after
> > the copying should lead to a deadlock, for example:
> > khugepaged                                            madvise(MADV_COLD)
> > folio_lock()                                          lock(ptl)
> > lock(ptl)                                             folio_lock()
> >
> > We can create a locked folio list, altogether drop both the locks, take the PTL,
> > do everything which __collapse_huge_page_isolate() does *except* the isolation and
> > again try locking folios, but then it will reduce efficiency of khugepaged
> > and almost looks like a forced solution :)
> > Please note the following discussion if anyone is interested:
> > https://lore.kernel.org/all/66bb7496-a445-4ad7-8e56-4f2863465c54@arm.com/
> > (Apologies for not CCing the mailing list from the start)
> >
> >   mm/khugepaged.c | 108 ++++++++++++++++++++++++++++++++++++++----------
> >   1 file changed, 87 insertions(+), 21 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 88beebef773e..8040b130e677 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -714,24 +714,28 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> >                                               struct vm_area_struct *vma,
> >                                               unsigned long address,
> >                                               spinlock_t *ptl,
> > -                                             struct list_head *compound_pagelist)
> > +                                             struct list_head *compound_pagelist, int order)
> >   {
> >       struct folio *src, *tmp;
> >       pte_t *_pte;
> >       pte_t pteval;
> >
> > -     for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> > +     for (_pte = pte; _pte < pte + (1UL << order);
> >            _pte++, address += PAGE_SIZE) {
> >               pteval = ptep_get(_pte);
> >               if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
> >                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
> >                       if (is_zero_pfn(pte_pfn(pteval))) {
> > -                             /*
> > -                              * ptl mostly unnecessary.
> > -                              */
> > -                             spin_lock(ptl);
> > -                             ptep_clear(vma->vm_mm, address, _pte);
> > -                             spin_unlock(ptl);
> > +                             if (order == HPAGE_PMD_ORDER) {
> > +                                     /*
> > +                                     * ptl mostly unnecessary.
> > +                                     */
> > +                                     spin_lock(ptl);
> > +                                     ptep_clear(vma->vm_mm, address, _pte);
> > +                                     spin_unlock(ptl);
> > +                             } else {
> > +                                     ptep_clear(vma->vm_mm, address, _pte);
> > +                             }
> >                               ksm_might_unmap_zero_page(vma->vm_mm, pteval);
> >                       }
> >               } else {
> > @@ -740,15 +744,20 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> >                       src = page_folio(src_page);
> >                       if (!folio_test_large(src))
> >                               release_pte_folio(src);
> > -                     /*
> > -                      * ptl mostly unnecessary, but preempt has to
> > -                      * be disabled to update the per-cpu stats
> > -                      * inside folio_remove_rmap_pte().
> > -                      */
> > -                     spin_lock(ptl);
> > -                     ptep_clear(vma->vm_mm, address, _pte);
> > -                     folio_remove_rmap_pte(src, src_page, vma);
> > -                     spin_unlock(ptl);
> > +                     if (order == HPAGE_PMD_ORDER) {
> > +                             /*
> > +                             * ptl mostly unnecessary, but preempt has to
> > +                             * be disabled to update the per-cpu stats
> > +                             * inside folio_remove_rmap_pte().
> > +                             */
> > +                             spin_lock(ptl);
> > +                             ptep_clear(vma->vm_mm, address, _pte);
>
>
>
>
> > +                             folio_remove_rmap_pte(src, src_page, vma);
> > +                             spin_unlock(ptl);

I think it is ok not to take the ptl since preemption is disabled at
this point by pte_map(); pte_unmap() is called after the copy.

> > +                     } else {
> > +                             ptep_clear(vma->vm_mm, address, _pte);
> > +                             folio_remove_rmap_pte(src, src_page, vma);
> > +                     }
>
> As I've talked to Nico about this code recently ... :)
>
> Are you clearing the PTE after the copy succeeded? If so, where is the
> TLB flush?
>
> How do you sync against concurrent write access + GUP-fast?
>
>
> The sequence really must be: (1) clear PTE/PMD + flush TLB (2) check if
> there are unexpected page references (e.g., GUP) if so back off (3)
> copy page content (4) set updated PTE/PMD.

Yeah, IIRC either the PMD is not cleared or tlb_remove_table_sync_one()
is not called, so concurrent GUP may change the refcount after the
refcount check.
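
Roughly (simplified, and the helper name is made up) the combination
that the existing PMD collapse relies on before trusting the reference
check:

static bool refs_are_stable(struct vm_area_struct *vma, unsigned long haddr,
			    pmd_t *pmd, struct folio *folio, int expected_refs)
{
	spinlock_t *pml = pmd_lock(vma->vm_mm, pmd);

	/* unmap the PTE table and flush the TLB for the range ... */
	pmdp_collapse_flush(vma, haddr, pmd);	/* real code keeps the old pmd_t */
	spin_unlock(pml);
	/*
	 * ... and wait for in-flight GUP-fast (which runs with IRQs
	 * disabled), so any reference it took is visible to us.
	 */
	tlb_remove_table_sync_one();

	/* only now does an elevated refcount reliably mean an unexpected pin */
	return folio_ref_count(folio) == expected_refs;
}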

>
> To Nico, I suggested doing it simple initially, and still clear the
> high-level PMD entry + flush under mmap write lock, then re-map the PTE
> table after modifying the page table. It's not as efficient, but "harder
> to get wrong".
>
> Maybe that's already happening, but I stumbled over this clearing logic
> in __collapse_huge_page_copy_succeeded(), so I'm curious.
>
> --
> Cheers,
>
> David / dhildenb
>
>