The following data was collected with bpftrace on a desktop system.
After the system had been left idle for 10 minutes after booting, a
large number of SCAN_PMD_MAPPED or SCAN_PMD_NONE results are observed
during a full scan by khugepaged.
@scan_pmd_status[1]: 1 ## SCAN_SUCCEED
@scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
@scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
total progress size: 701 MB
Total time: 440 seconds ## includes khugepaged_scan_sleep_millisecs
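For reference, the bpftrace map keys above are the numeric values of
enum scan_result in mm/khugepaged.c. A minimal sketch of the relevant
members, assuming the enum layout of the traced kernel (exact values
can differ between trees):

enum scan_result {
	SCAN_FAIL,		/* 0 */
	SCAN_SUCCEED,		/* 1 -> @scan_pmd_status[1] */
	SCAN_PMD_NULL,		/* 2 */
	SCAN_PMD_NONE,		/* 3 -> @scan_pmd_status[3] */
	SCAN_PMD_MAPPED,	/* 4 -> @scan_pmd_status[4] */
	/* remaining members elided */
};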
The khugepaged_scan list holds every task whose memory is eligible for
collapse into hugepages; as long as a task is not destroyed, khugepaged
never removes it from the khugepaged_scan list. This leads to a
situation where a task has already collapsed all of its memory regions
into hugepages, yet khugepaged keeps scanning it, which wastes CPU time
for no benefit. Combined with khugepaged_scan_sleep_millisecs (default
10s), rescanning a large number of such stale tasks delays the scan of
tasks that still have collapsible memory.
After applying this patch, when every region of an mm scans as either
SCAN_PMD_MAPPED or SCAN_PMD_NONE, the mm is automatically removed from
khugepaged's scan list. If the mm later takes a page fault or calls
MADV_HUGEPAGE again, it is added back to khugepaged.
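The re-add works through the existing registration path: because
collect_mm_slot() now clears MMF_VM_HUGEPAGE when dropping the mm, the
next page fault (or MADV_HUGEPAGE) sees the flag unset and registers
the mm again. A simplified sketch of that path, modeled on the current
khugepaged_enter_vma() helper (abbreviated, not the exact upstream
body):

void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags)
{
	/*
	 * MMF_VM_HUGEPAGE was cleared when the mm was removed from the
	 * scan list, so this test passes again and __khugepaged_enter()
	 * puts the mm back on khugepaged_scan.mm_head.
	 */
	if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
	    hugepage_pmd_enabled() &&
	    thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
		__khugepaged_enter(vma->vm_mm);
}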
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
1 file changed, 25 insertions(+), 10 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0598a19a98cc..1ec1af5be3c8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -115,6 +115,7 @@ struct khugepaged_scan {
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
+ bool maybe_collapse;
};
static struct khugepaged_scan khugepaged_scan = {
@@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
return result;
}
-static void collect_mm_slot(struct mm_slot *slot)
+static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
{
struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
- if (hpage_collapse_test_exit(mm)) {
+ if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
/* free mm_slot */
hash_del(&slot->hash);
list_del(&slot->mm_node);
- /*
- * Not strictly needed because the mm exited already.
- *
- * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
- */
+ if (!maybe_collapse)
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
/* khugepaged_mm_lock actually not necessary for the below */
mm_slot_free(mm_slot_cache, slot);
@@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct mm_slot, mm_node);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = slot;
+ khugepaged_scan.maybe_collapse = false;
}
spin_unlock(&khugepaged_mm_lock);
@@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
khugepaged_scan.address, &mmap_locked, cc);
}
- if (*result == SCAN_SUCCEED)
+ switch (*result) {
+ case SCAN_PMD_NULL:
+ case SCAN_PMD_NONE:
+ case SCAN_PMD_MAPPED:
+ case SCAN_PTE_MAPPED_HUGEPAGE:
+ break;
+ case SCAN_SUCCEED:
++khugepaged_pages_collapsed;
+ fallthrough;
+ default:
+ khugepaged_scan.maybe_collapse = true;
+ }
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
* if we scanned all vmas of this mm.
*/
if (hpage_collapse_test_exit(mm) || !vma) {
+ bool maybe_collapse = khugepaged_scan.maybe_collapse;
+
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+ maybe_collapse = true;
+
/*
* Make sure that if mm_users is reaching zero while
* khugepaged runs here, khugepaged_exit will find
@@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
khugepaged_scan.address = 0;
+ khugepaged_scan.maybe_collapse = false;
} else {
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
- collect_mm_slot(slot);
+ collect_mm_slot(slot, maybe_collapse);
}
trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
@@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
if (slot)
- collect_mm_slot(slot);
+ collect_mm_slot(slot, true);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
--
2.51.0
Hi Vernon,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.19-rc1 next-20251215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
config: x86_64-rhel-9.4 (https://download.01.org/0day-ci/archive/20251216/202512160619.3Ut4sxaJ-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512160619.3Ut4sxaJ-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512160619.3Ut4sxaJ-lkp@intel.com/
All errors (new ones prefixed by >>):
mm/khugepaged.c: In function 'khugepaged_scan_mm_slot':
>> mm/khugepaged.c:2490:30: error: 'SCAN_PMD_NULL' undeclared (first use in this function); did you mean 'SCAN_VMA_NULL'?
2490 | case SCAN_PMD_NULL:
| ^~~~~~~~~~~~~
| SCAN_VMA_NULL
mm/khugepaged.c:2490:30: note: each undeclared identifier is reported only once for each function it appears in
>> mm/khugepaged.c:2491:30: error: 'SCAN_PMD_NONE' undeclared (first use in this function)
2491 | case SCAN_PMD_NONE:
| ^~~~~~~~~~~~~
vim +2490 mm/khugepaged.c
2392
2393 static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
2394 struct collapse_control *cc)
2395 __releases(&khugepaged_mm_lock)
2396 __acquires(&khugepaged_mm_lock)
2397 {
2398 struct vma_iterator vmi;
2399 struct mm_slot *slot;
2400 struct mm_struct *mm;
2401 struct vm_area_struct *vma;
2402 int progress = 0;
2403
2404 VM_BUG_ON(!pages);
2405 lockdep_assert_held(&khugepaged_mm_lock);
2406 *result = SCAN_FAIL;
2407
2408 if (khugepaged_scan.mm_slot) {
2409 slot = khugepaged_scan.mm_slot;
2410 } else {
2411 slot = list_first_entry(&khugepaged_scan.mm_head,
2412 struct mm_slot, mm_node);
2413 khugepaged_scan.address = 0;
2414 khugepaged_scan.mm_slot = slot;
2415 khugepaged_scan.maybe_collapse = false;
2416 }
2417 spin_unlock(&khugepaged_mm_lock);
2418
2419 mm = slot->mm;
2420 /*
2421 * Don't wait for semaphore (to avoid long wait times). Just move to
2422 * the next mm on the list.
2423 */
2424 vma = NULL;
2425 if (unlikely(!mmap_read_trylock(mm)))
2426 goto breakouterloop_mmap_lock;
2427
2428 progress++;
2429 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2430 goto breakouterloop;
2431
2432 vma_iter_init(&vmi, mm, khugepaged_scan.address);
2433 for_each_vma(vmi, vma) {
2434 unsigned long hstart, hend;
2435
2436 cond_resched();
2437 if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
2438 progress++;
2439 break;
2440 }
2441 if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
2442 skip:
2443 progress++;
2444 continue;
2445 }
2446 hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
2447 hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
2448 if (khugepaged_scan.address > hend)
2449 goto skip;
2450 if (khugepaged_scan.address < hstart)
2451 khugepaged_scan.address = hstart;
2452 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2453
2454 while (khugepaged_scan.address < hend) {
2455 bool mmap_locked = true;
2456
2457 cond_resched();
2458 if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
2459 goto breakouterloop;
2460
2461 VM_BUG_ON(khugepaged_scan.address < hstart ||
2462 khugepaged_scan.address + HPAGE_PMD_SIZE >
2463 hend);
2464 if (!vma_is_anonymous(vma)) {
2465 struct file *file = get_file(vma->vm_file);
2466 pgoff_t pgoff = linear_page_index(vma,
2467 khugepaged_scan.address);
2468
2469 mmap_read_unlock(mm);
2470 mmap_locked = false;
2471 *result = hpage_collapse_scan_file(mm,
2472 khugepaged_scan.address, file, pgoff, cc);
2473 fput(file);
2474 if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
2475 mmap_read_lock(mm);
2476 if (hpage_collapse_test_exit_or_disable(mm))
2477 goto breakouterloop;
2478 *result = collapse_pte_mapped_thp(mm,
2479 khugepaged_scan.address, false);
2480 if (*result == SCAN_PMD_MAPPED)
2481 *result = SCAN_SUCCEED;
2482 mmap_read_unlock(mm);
2483 }
2484 } else {
2485 *result = hpage_collapse_scan_pmd(mm, vma,
2486 khugepaged_scan.address, &mmap_locked, cc);
2487 }
2488
2489 switch (*result) {
> 2490 case SCAN_PMD_NULL:
> 2491 case SCAN_PMD_NONE:
2492 case SCAN_PMD_MAPPED:
2493 case SCAN_PTE_MAPPED_HUGEPAGE:
2494 break;
2495 case SCAN_SUCCEED:
2496 ++khugepaged_pages_collapsed;
2497 fallthrough;
2498 default:
2499 khugepaged_scan.maybe_collapse = true;
2500 }
2501
2502 /* move to next address */
2503 khugepaged_scan.address += HPAGE_PMD_SIZE;
2504 progress += HPAGE_PMD_NR;
2505 if (!mmap_locked)
2506 /*
2507 * We released mmap_lock so break loop. Note
2508 * that we drop mmap_lock before all hugepage
2509 * allocations, so if allocation fails, we are
2510 * guaranteed to break here and report the
2511 * correct result back to caller.
2512 */
2513 goto breakouterloop_mmap_lock;
2514 if (progress >= pages)
2515 goto breakouterloop;
2516 }
2517 }
2518 breakouterloop:
2519 mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
2520 breakouterloop_mmap_lock:
2521
2522 spin_lock(&khugepaged_mm_lock);
2523 VM_BUG_ON(khugepaged_scan.mm_slot != slot);
2524 /*
2525 * Release the current mm_slot if this mm is about to die, or
2526 * if we scanned all vmas of this mm.
2527 */
2528 if (hpage_collapse_test_exit(mm) || !vma) {
2529 bool maybe_collapse = khugepaged_scan.maybe_collapse;
2530
2531 if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
2532 maybe_collapse = true;
2533
2534 /*
2535 * Make sure that if mm_users is reaching zero while
2536 * khugepaged runs here, khugepaged_exit will find
2537 * mm_slot not pointing to the exiting mm.
2538 */
2539 if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
2540 khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
2541 khugepaged_scan.address = 0;
2542 khugepaged_scan.maybe_collapse = false;
2543 } else {
2544 khugepaged_scan.mm_slot = NULL;
2545 khugepaged_full_scans++;
2546 }
2547
2548 collect_mm_slot(slot, maybe_collapse);
2549 }
2550
2551 trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
2552
2553 return progress;
2554 }
2555
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Vernon,
kernel test robot noticed the following build errors:
[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.19-rc1 next-20251215]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Vernon-Yang/mm-khugepaged-add-trace_mm_khugepaged_scan-event/20251215-171046
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20251215090419.174418-3-yanglincheng%40kylinos.cn
patch subject: [PATCH 2/4] mm: khugepaged: remove mm when all memory has been collapsed
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20251216/202512160533.KuHwyJTP-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251216/202512160533.KuHwyJTP-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512160533.KuHwyJTP-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/khugepaged.c:2490:9: error: use of undeclared identifier 'SCAN_PMD_NULL'; did you mean 'SCAN_VMA_NULL'?
2490 | case SCAN_PMD_NULL:
| ^~~~~~~~~~~~~
| SCAN_VMA_NULL
mm/khugepaged.c:50:2: note: 'SCAN_VMA_NULL' declared here
50 | SCAN_VMA_NULL,
| ^
>> mm/khugepaged.c:2491:9: error: use of undeclared identifier 'SCAN_PMD_NONE'
2491 | case SCAN_PMD_NONE:
| ^
2 errors generated.
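A possible direction for a respin, sketched here only as a shape and
not as the actual fix: restrict the switch to result codes that are
declared in mm-everything, and re-express the SCAN_PMD_NULL /
SCAN_PMD_NONE cases in terms of whatever that tree's enum scan_result
provides:

	switch (*result) {
	case SCAN_PMD_MAPPED:
	case SCAN_PTE_MAPPED_HUGEPAGE:
		/* nothing collapsible here; leave maybe_collapse unset */
		break;
	case SCAN_SUCCEED:
		++khugepaged_pages_collapsed;
		fallthrough;
	default:
		/* a later scan of this mm might still succeed */
		khugepaged_scan.maybe_collapse = true;
	}

Note that folding the missing cases into the default branch changes
behavior (those results would then keep the mm on the scan list), so
the equivalents in the base tree need to be identified first.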
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Vernon,
Thanks for the patches!
On 2025/12/15 17:04, Vernon Yang wrote:
> The following data is traced by bpftrace on a desktop system. After
> the system has been left idle for 10 minutes upon booting, a lot of
> SCAN_PMD_MAPPED or SCAN_PMD_NONE are observed during a full scan by
> khugepaged.
>
> @scan_pmd_status[1]: 1 ## SCAN_SUCCEED
> @scan_pmd_status[4]: 158 ## SCAN_PMD_MAPPED
> @scan_pmd_status[3]: 174 ## SCAN_PMD_NONE
> total progress size: 701 MB
> Total time : 440 seconds ## include khugepaged_scan_sleep_millisecs
>
> The khugepaged_scan list save all task that support collapse into hugepage,
> as long as the take is not destroyed, khugepaged will not remove it from
Nit: s/take/task/
> the khugepaged_scan list. This exist a phenomenon where task has already
> collapsed all memory regions into hugepage, but khugepaged continues to
> scan it, which wastes CPU time and invalid, and due to
> khugepaged_scan_sleep_millisecs (default 10s) causes a long wait for
> scanning a large number of invalid task, so scanning really valid task
> is later.
>
> After applying this patch, when all memory is either SCAN_PMD_MAPPED or
> SCAN_PMD_NONE, the mm is automatically removed from khugepaged's scan
> list. If the page fault or MADV_HUGEPAGE again, it is added back to
> khugepaged.
>
> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> ---
> mm/khugepaged.c | 35 +++++++++++++++++++++++++----------
> 1 file changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 0598a19a98cc..1ec1af5be3c8 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -115,6 +115,7 @@ struct khugepaged_scan {
> struct list_head mm_head;
> struct mm_slot *mm_slot;
> unsigned long address;
> + bool maybe_collapse;
At a quick glance, the name of "maybe_collapse" is a bit ambiguous ...
Perhaps "scan_needed" or "collapse_possible" would be clearer to
indicate that the mm should be kept in the scan list?
> };
>
> static struct khugepaged_scan khugepaged_scan = {
> @@ -1420,22 +1421,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
> return result;
> }
>
> -static void collect_mm_slot(struct mm_slot *slot)
> +static void collect_mm_slot(struct mm_slot *slot, bool maybe_collapse)
> {
> struct mm_struct *mm = slot->mm;
>
> lockdep_assert_held(&khugepaged_mm_lock);
>
> - if (hpage_collapse_test_exit(mm)) {
> + if (hpage_collapse_test_exit(mm) || !maybe_collapse) {
> /* free mm_slot */
> hash_del(&slot->hash);
> list_del(&slot->mm_node);
>
> - /*
> - * Not strictly needed because the mm exited already.
> - *
> - * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
> - */
> + if (!maybe_collapse)
> + mm_flags_clear(MMF_VM_HUGEPAGE, mm);
>
> /* khugepaged_mm_lock actually not necessary for the below */
> mm_slot_free(mm_slot_cache, slot);
> @@ -2397,6 +2395,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> struct mm_slot, mm_node);
> khugepaged_scan.address = 0;
> khugepaged_scan.mm_slot = slot;
> + khugepaged_scan.maybe_collapse = false;
> }
> spin_unlock(&khugepaged_mm_lock);
>
> @@ -2470,8 +2469,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> khugepaged_scan.address, &mmap_locked, cc);
> }
>
> - if (*result == SCAN_SUCCEED)
> + switch (*result) {
> + case SCAN_PMD_NULL:
> + case SCAN_PMD_NONE:
> + case SCAN_PMD_MAPPED:
> + case SCAN_PTE_MAPPED_HUGEPAGE:
> + break;
> + case SCAN_SUCCEED:
> ++khugepaged_pages_collapsed;
> + fallthrough;
> + default:
> + khugepaged_scan.maybe_collapse = true;
> + }
>
> /* move to next address */
> khugepaged_scan.address += HPAGE_PMD_SIZE;
> @@ -2500,6 +2509,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> * if we scanned all vmas of this mm.
> */
> if (hpage_collapse_test_exit(mm) || !vma) {
> + bool maybe_collapse = khugepaged_scan.maybe_collapse;
> +
> + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
> + maybe_collapse = true;
> +
> /*
> * Make sure that if mm_users is reaching zero while
> * khugepaged runs here, khugepaged_exit will find
> @@ -2508,12 +2522,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
> if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
> khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
> khugepaged_scan.address = 0;
> + khugepaged_scan.maybe_collapse = false;
> } else {
> khugepaged_scan.mm_slot = NULL;
> khugepaged_full_scans++;
> }
>
> - collect_mm_slot(slot);
> + collect_mm_slot(slot, maybe_collapse);
> }
>
> trace_mm_khugepaged_scan(mm, progress, khugepaged_scan.mm_slot == NULL);
> @@ -2616,7 +2631,7 @@ static int khugepaged(void *none)
> slot = khugepaged_scan.mm_slot;
> khugepaged_scan.mm_slot = NULL;
> if (slot)
> - collect_mm_slot(slot);
> + collect_mm_slot(slot, true);
> spin_unlock(&khugepaged_mm_lock);
> return 0;
> }