Scale down the scan range and the sysfs tunables according to the scan order,
and isolate the folios.
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
mm/khugepaged.c | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f52dae7d5179..de044b1f83d4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -564,15 +564,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte,
struct collapse_control *cc,
- struct list_head *compound_pagelist)
+ struct list_head *compound_pagelist, int order)
{
- struct page *page = NULL;
- struct folio *folio = NULL;
- pte_t *_pte;
+ unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
+ unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
+ struct folio *folio = NULL;
+ struct page *page = NULL;
bool writable = false;
+ pte_t *_pte;

- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+
+ for (_pte = pte; _pte < pte + (1UL << order);
_pte++, address += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
@@ -580,7 +583,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
+ none_or_zero <= max_ptes_none)) {
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
@@ -609,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (folio_likely_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
- shared > khugepaged_max_ptes_shared) {
+ shared > max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
@@ -1200,7 +1203,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
if (pte) {
result = __collapse_huge_page_isolate(vma, address, pte, cc,
- &compound_pagelist);
+ &compound_pagelist, order);
spin_unlock(pte_ptl);
} else {
result = SCAN_PMD_NULL;
--
2.30.2
On 16/12/2024 16:50, Dev Jain wrote:
> Scale down the scan range and the sysfs tunables according to the scan order,
> and isolate the folios.
>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
> mm/khugepaged.c | 19 +++++++++++--------
> 1 file changed, 11 insertions(+), 8 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f52dae7d5179..de044b1f83d4 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -564,15 +564,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long address,
> pte_t *pte,
> struct collapse_control *cc,
> - struct list_head *compound_pagelist)
> + struct list_head *compound_pagelist, int order)
> {
> - struct page *page = NULL;
> - struct folio *folio = NULL;
> - pte_t *_pte;
> + unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
> + unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
This is implicitly rounding down. I think that's the right thing to do; it's
better to be conservative.
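For example (rough numbers, assuming the defaults on a 4K-page kernel where
HPAGE_PMD_ORDER is 9, khugepaged_max_ptes_none defaults to HPAGE_PMD_NR - 1 = 511
and khugepaged_max_ptes_shared to HPAGE_PMD_NR / 2 = 256), a collapse attempt at
order 4 (16 PTEs) ends up with:

	max_ptes_none   = 511 >> (9 - 4) = 15;	/* 511 / 32 = 15.97, rounded down */
	max_ptes_shared = 256 >> (9 - 4) = 8;

so each order keeps roughly the same proportion of its (smaller) PTE range, erring
low whenever the division isn't exact.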
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> + struct folio *folio = NULL;
> + struct page *page = NULL;
> bool writable = false;
> + pte_t *_pte;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +
> + for (_pte = pte; _pte < pte + (1UL << order);
> _pte++, address += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (pte_none(pteval) || (pte_present(pteval) &&
> @@ -580,7 +583,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> ++none_or_zero;
> if (!userfaultfd_armed(vma) &&
> (!cc->is_khugepaged ||
> - none_or_zero <= khugepaged_max_ptes_none)) {
> + none_or_zero <= max_ptes_none)) {
> continue;
> } else {
> result = SCAN_EXCEED_NONE_PTE;
> @@ -609,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> if (folio_likely_mapped_shared(folio)) {
> ++shared;
> if (cc->is_khugepaged &&
> - shared > khugepaged_max_ptes_shared) {
> + shared > max_ptes_shared) {
> result = SCAN_EXCEED_SHARED_PTE;
> count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> goto out;
> @@ -1200,7 +1203,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> - &compound_pagelist);
> + &compound_pagelist, order);
> spin_unlock(pte_ptl);
> } else {
> result = SCAN_PMD_NULL;
On Mon, Dec 16, 2024 at 10:20:58PM +0530, Dev Jain wrote:
> {
> - struct page *page = NULL;
> - struct folio *folio = NULL;
> - pte_t *_pte;
> + unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
> + unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> + struct folio *folio = NULL;
> + struct page *page = NULL;
why are you moving variables around unnecessarily?
> bool writable = false;
> + pte_t *_pte;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +
> + for (_pte = pte; _pte < pte + (1UL << order);
spurious blank line
also you might first want to finish off the page->folio conversion in
this function; we have a vm_normal_folio() now.
On 17/12/24 10:02 am, Matthew Wilcox wrote:
> On Mon, Dec 16, 2024 at 10:20:58PM +0530, Dev Jain wrote:
>> {
>> - struct page *page = NULL;
>> - struct folio *folio = NULL;
>> - pte_t *_pte;
>> + unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
>> + unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
>> + struct folio *folio = NULL;
>> + struct page *page = NULL;
> why are you moving variables around unnecessarily?
In a previous piece of work, I moved code around and David asked me to arrange the declarations
in reverse Xmas tree order. I assumed (?) that wasn't considered to be spoiling git history, so if
this feels like it is here, I will revert.
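For reference, "reverse Xmas tree" just means sorting the declarations from longest
line to shortest, so the reordered block ends up looking like:

	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
	struct folio *folio = NULL;
	struct page *page = NULL;
	bool writable = false;
	pte_t *_pte;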
>
>> bool writable = false;
>> + pte_t *_pte;
>>
>> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>> +
>> + for (_pte = pte; _pte < pte + (1UL << order);
> spurious blank line
My bad
>
>
> also you might first want to finish off the page->folio conversion in
> this function; we have a vm_normal_folio() now.
I did not add any code before we derive the folio... I'm sorry, I don't get what you mean...
On 17/12/2024 06:41, Dev Jain wrote:
>
> On 17/12/24 10:02 am, Matthew Wilcox wrote:
>> On Mon, Dec 16, 2024 at 10:20:58PM +0530, Dev Jain wrote:
>>> {
>>> - struct page *page = NULL;
>>> - struct folio *folio = NULL;
>>> - pte_t *_pte;
>>> + unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
>>> + unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>>> int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
>>> + struct folio *folio = NULL;
>>> + struct page *page = NULL;
>> why are you moving variables around unnecessarily?
>
> In a previous piece of work, I moved code around and David asked me to arrange the declarations
> in reverse Xmas tree order. I assumed (?) that wasn't considered to be spoiling git history, so if
> this feels like it is here, I will revert.
>
>>
>>> bool writable = false;
>>> + pte_t *_pte;
>>> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>>> +
>>> + for (_pte = pte; _pte < pte + (1UL << order);
>> spurious blank line
>
> My bad
>
>>
>>
>> also you might first want to finish off the page->folio conversion in
>> this function; we have a vm_normal_folio() now.
>
> I did not add any code before we derive the folio... I'm sorry, I don't get what
> you mean...
>
I think Matthew is suggesting helping out with the page to folio conversion work
while you are working on this function. I think it would amount to a patch that
does something like this:
----8<-----
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ffc4d5aef991..d94e05754140 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -568,7 +568,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
struct folio *folio = NULL;
- struct page *page = NULL;
bool writable = false;
pte_t *_pte;
@@ -597,13 +596,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_UFFD_WP;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+ folio = vm_normal_folio(vma, address, pteval);
+ if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
result = SCAN_PAGE_NULL;
goto out;
}
- folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
if (order != HPAGE_PMD_ORDER && folio_order(folio) >= order) {
----8<-----
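For what it's worth, vm_normal_folio() is just the folio-returning flavour of
vm_normal_page(); roughly (paraphrasing mm/memory.c, so check the current tree for
the exact form):

	struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
				      pte_t pte)
	{
		struct page *page = vm_normal_page(vma, addr, pte);

		if (page)
			return page_folio(page);
		return NULL;
	}

which is why the explicit page_folio(page) call can simply drop out above.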