From: Vernon Yang <yanglincheng@kylinos.cn>
For example, create three tasks: hot1 -> cold -> hot2. After all three
task are created, each allocate memory 128MB. the hot1/hot2 task
continuously access 128 MB memory, while the cold task only accesses
its memory briefly and then call madvise(MADV_FREE). However, khugepaged
still prioritizes scanning the cold task and only scans the hot2 task
after completing the scan of the cold task.
And if we collapse with a lazyfree page, that content will never be none
and the deferred shrinker cannot reclaim them.
So if the user has explicitly informed us via MADV_FREE that this memory
will be freed, it is appropriate for khugepaged to simply skip it, thereby
avoiding unnecessary scan and collapse operations to reduce CPU
wastage.
Here are the performance test results:
(Throughput bigger is better, other smaller is better)
Testing on x86_64 machine:
| task hot2 | without patch | with patch | delta |
|---------------------|---------------|---------------|---------|
| total accesses time | 3.14 sec | 2.93 sec | -6.69% |
| cycles per access | 4.96 | 2.21 | -55.44% |
| Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% |
| dTLB-load-misses | 284814532 | 69597236 | -75.56% |
Testing on qemu-system-x86_64 -enable-kvm:
| task hot2 | without patch | with patch | delta |
|---------------------|---------------|---------------|---------|
| total accesses time | 3.35 sec | 2.96 sec | -11.64% |
| cycles per access | 7.29 | 2.07 | -71.60% |
| Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% |
| dTLB-load-misses | 241600871 | 3216108 | -98.67% |
Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
---
include/trace/events/huge_memory.h | 1 +
mm/khugepaged.c | 13 +++++++++++++
2 files changed, 14 insertions(+)
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 384e29f6bef0..bcdc57eea270 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -25,6 +25,7 @@
EM( SCAN_PAGE_LRU, "page_not_in_lru") \
EM( SCAN_PAGE_LOCK, "page_locked") \
EM( SCAN_PAGE_ANON, "page_not_anon") \
+ EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \
EM( SCAN_PAGE_COMPOUND, "page_compound") \
EM( SCAN_ANY_PROCESS, "no_process_for_page") \
EM( SCAN_VMA_NULL, "vma_null") \
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index df22b2274d92..b4def001ccd0 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -46,6 +46,7 @@ enum scan_result {
SCAN_PAGE_LRU,
SCAN_PAGE_LOCK,
SCAN_PAGE_ANON,
+ SCAN_PAGE_LAZYFREE,
SCAN_PAGE_COMPOUND,
SCAN_ANY_PROCESS,
SCAN_VMA_NULL,
@@ -583,6 +584,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
folio = page_folio(page);
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
+ if (cc->is_khugepaged && !pte_dirty(pteval) &&
+ folio_test_lazyfree(folio)) {
+ result = SCAN_PAGE_LAZYFREE;
+ goto out;
+ }
+
/* See hpage_collapse_scan_pmd(). */
if (folio_maybe_mapped_shared(folio)) {
++shared;
@@ -1332,6 +1339,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
}
folio = page_folio(page);
+ if (cc->is_khugepaged && !pte_dirty(pteval) &&
+ folio_test_lazyfree(folio)) {
+ result = SCAN_PAGE_LAZYFREE;
+ goto out_unmap;
+ }
+
if (!folio_test_anon(folio)) {
result = SCAN_PAGE_ANON;
goto out_unmap;
--
2.51.0
On 2/1/26 13:25, Vernon Yang wrote:
> From: Vernon Yang <yanglincheng@kylinos.cn>
>
> For example, create three task: hot1 -> cold -> hot2. After all three
> task are created, each allocate memory 128MB. the hot1/hot2 task
> continuously access 128 MB memory, while the cold task only accesses
> its memory briefly andthen call madvise(MADV_FREE). However, khugepaged
> still prioritizes scanning the cold task and only scans the hot2 task
> after completing the scan of the cold task.
>
> And if we collapse with a lazyfree page, that content will never be none
> and the deferred shrinker cannot reclaim them.
>
> So if the user has explicitly informed us via MADV_FREE that this memory
> will be freed, it is appropriate for khugepaged to skip it only, thereby
> avoiding unnecessary scan and collapse operations to reducing CPU
> wastage.
>
> Here are the performance test results:
> (Throughput bigger is better, other smaller is better)
>
> Testing on x86_64 machine:
>
> | task hot2 | without patch | with patch | delta |
> |---------------------|---------------|---------------|---------|
> | total accesses time | 3.14 sec | 2.93 sec | -6.69% |
> | cycles per access | 4.96 | 2.21 | -55.44% |
> | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% |
> | dTLB-load-misses | 284814532 | 69597236 | -75.56% |
>
> Testing on qemu-system-x86_64 -enable-kvm:
>
> | task hot2 | without patch | with patch | delta |
> |---------------------|---------------|---------------|---------|
> | total accesses time | 3.35 sec | 2.96 sec | -11.64% |
> | cycles per access | 7.29 | 2.07 | -71.60% |
> | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% |
> | dTLB-load-misses | 241600871 | 3216108 | -98.67% |
>
> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> ---
> include/trace/events/huge_memory.h | 1 +
> mm/khugepaged.c | 13 +++++++++++++
> 2 files changed, 14 insertions(+)
>
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index 384e29f6bef0..bcdc57eea270 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -25,6 +25,7 @@
> EM( SCAN_PAGE_LRU, "page_not_in_lru") \
> EM( SCAN_PAGE_LOCK, "page_locked") \
> EM( SCAN_PAGE_ANON, "page_not_anon") \
> + EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \
> EM( SCAN_PAGE_COMPOUND, "page_compound") \
> EM( SCAN_ANY_PROCESS, "no_process_for_page") \
> EM( SCAN_VMA_NULL, "vma_null") \
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index df22b2274d92..b4def001ccd0 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -46,6 +46,7 @@ enum scan_result {
> SCAN_PAGE_LRU,
> SCAN_PAGE_LOCK,
> SCAN_PAGE_ANON,
> + SCAN_PAGE_LAZYFREE,
> SCAN_PAGE_COMPOUND,
> SCAN_ANY_PROCESS,
> SCAN_VMA_NULL,
> @@ -583,6 +584,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> folio = page_folio(page);
> VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
>
> + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> + folio_test_lazyfree(folio)) {
Should be aligned as
if (cc->is_khugepaged && !pte_dirty(pteval) &&
folio_test_lazyfree(folio)) {
But you could just have it in a single line.
> + result = SCAN_PAGE_LAZYFREE;
> + goto out;
> + }
> +
> /* See hpage_collapse_scan_pmd(). */
> if (folio_maybe_mapped_shared(folio)) {
> ++shared;
> @@ -1332,6 +1339,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> }
> folio = page_folio(page);
>
> + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> + folio_test_lazyfree(folio)) {
> + result = SCAN_PAGE_LAZYFREE;
> + goto out_unmap;
> + }
Dito.
> +
> if (!folio_test_anon(folio)) {
> result = SCAN_PAGE_ANON;
> goto out_unmap;
Surprised that there is no need to add checks for SCAN_PAGE_LAZYFREE
anywhere, but it's similar to SCAN_PAGE_LOCK just that we cannot ever
run into it for madvise.
Acked-by: David Hildenbrand (arm) <david@kernel.org>
--
Cheers,
David
On Thu, Feb 5, 2026 at 5:24 AM David Hildenbrand (arm) <david@kernel.org> wrote:
>
> On 2/1/26 13:25, Vernon Yang wrote:
> > From: Vernon Yang <yanglincheng@kylinos.cn>
> >
> > For example, create three task: hot1 -> cold -> hot2. After all three
> > task are created, each allocate memory 128MB. the hot1/hot2 task
> > continuously access 128 MB memory, while the cold task only accesses
> > its memory briefly andthen call madvise(MADV_FREE). However, khugepaged
> > still prioritizes scanning the cold task and only scans the hot2 task
> > after completing the scan of the cold task.
> >
> > And if we collapse with a lazyfree page, that content will never be none
> > and the deferred shrinker cannot reclaim them.
> >
> > So if the user has explicitly informed us via MADV_FREE that this memory
> > will be freed, it is appropriate for khugepaged to skip it only, thereby
> > avoiding unnecessary scan and collapse operations to reducing CPU
> > wastage.
> >
> > Here are the performance test results:
> > (Throughput bigger is better, other smaller is better)
> >
> > Testing on x86_64 machine:
> >
> > | task hot2 | without patch | with patch | delta |
> > |---------------------|---------------|---------------|---------|
> > | total accesses time | 3.14 sec | 2.93 sec | -6.69% |
> > | cycles per access | 4.96 | 2.21 | -55.44% |
> > | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% |
> > | dTLB-load-misses | 284814532 | 69597236 | -75.56% |
> >
> > Testing on qemu-system-x86_64 -enable-kvm:
> >
> > | task hot2 | without patch | with patch | delta |
> > |---------------------|---------------|---------------|---------|
> > | total accesses time | 3.35 sec | 2.96 sec | -11.64% |
> > | cycles per access | 7.29 | 2.07 | -71.60% |
> > | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% |
> > | dTLB-load-misses | 241600871 | 3216108 | -98.67% |
> >
> > Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> > ---
> > include/trace/events/huge_memory.h | 1 +
> > mm/khugepaged.c | 13 +++++++++++++
> > 2 files changed, 14 insertions(+)
> >
> > diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> > index 384e29f6bef0..bcdc57eea270 100644
> > --- a/include/trace/events/huge_memory.h
> > +++ b/include/trace/events/huge_memory.h
> > @@ -25,6 +25,7 @@
> > EM( SCAN_PAGE_LRU, "page_not_in_lru") \
> > EM( SCAN_PAGE_LOCK, "page_locked") \
> > EM( SCAN_PAGE_ANON, "page_not_anon") \
> > + EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \
> > EM( SCAN_PAGE_COMPOUND, "page_compound") \
> > EM( SCAN_ANY_PROCESS, "no_process_for_page") \
> > EM( SCAN_VMA_NULL, "vma_null") \
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index df22b2274d92..b4def001ccd0 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -46,6 +46,7 @@ enum scan_result {
> > SCAN_PAGE_LRU,
> > SCAN_PAGE_LOCK,
> > SCAN_PAGE_ANON,
> > + SCAN_PAGE_LAZYFREE,
> > SCAN_PAGE_COMPOUND,
> > SCAN_ANY_PROCESS,
> > SCAN_VMA_NULL,
> > @@ -583,6 +584,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > folio = page_folio(page);
> > VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
> >
> > + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> > + folio_test_lazyfree(folio)) {
>
> Should be aligned as
>
> if (cc->is_khugepaged && !pte_dirty(pteval) &&
> folio_test_lazyfree(folio)) {
LGTM, Thank you for review and suggestion, I will do it in the next version.
> But you could just have it in a single line.
If it is placed on a single line, it will exceed 80 characters.
> > + result = SCAN_PAGE_LAZYFREE;
> > + goto out;
> > + }
> > +
> > /* See hpage_collapse_scan_pmd(). */
> > if (folio_maybe_mapped_shared(folio)) {
> > ++shared;
> > @@ -1332,6 +1339,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > }
> > folio = page_folio(page);
> >
> > + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> > + folio_test_lazyfree(folio)) {
> > + result = SCAN_PAGE_LAZYFREE;
> > + goto out_unmap;
> > + }
>
> Dito.
>
> > +
> > if (!folio_test_anon(folio)) {
> > result = SCAN_PAGE_ANON;
> > goto out_unmap;
>
> Surprised that there is no need to add checks for SCAN_PAGE_LAZYFREE
> anywhere, but it's similar to SCAN_PAGE_LOCK just that we cannot ever
> run into it for madvise.
>
> Acked-by: David Hildenbrand (arm) <david@kernel.org>
Thank you for review and explanation.
> --
> Cheers,
>
> David
On 2026/2/1 20:25, Vernon Yang wrote:
> From: Vernon Yang <yanglincheng@kylinos.cn>
>
> For example, create three task: hot1 -> cold -> hot2. After all three
> task are created, each allocate memory 128MB. the hot1/hot2 task
> continuously access 128 MB memory, while the cold task only accesses
> its memory briefly andthen call madvise(MADV_FREE). However, khugepaged
s/andthen/and then/
> still prioritizes scanning the cold task and only scans the hot2 task
> after completing the scan of the cold task.
>
> And if we collapse with a lazyfree page, that content will never be none
> and the deferred shrinker cannot reclaim them.
>
> So if the user has explicitly informed us via MADV_FREE that this memory
> will be freed, it is appropriate for khugepaged to skip it only, thereby
> avoiding unnecessary scan and collapse operations to reducing CPU
> wastage.
>
> Here are the performance test results:
> (Throughput bigger is better, other smaller is better)
>
> Testing on x86_64 machine:
>
> | task hot2 | without patch | with patch | delta |
> |---------------------|---------------|---------------|---------|
> | total accesses time | 3.14 sec | 2.93 sec | -6.69% |
> | cycles per access | 4.96 | 2.21 | -55.44% |
> | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% |
> | dTLB-load-misses | 284814532 | 69597236 | -75.56% |
>
> Testing on qemu-system-x86_64 -enable-kvm:
>
> | task hot2 | without patch | with patch | delta |
> |---------------------|---------------|---------------|---------|
> | total accesses time | 3.35 sec | 2.96 sec | -11.64% |
> | cycles per access | 7.29 | 2.07 | -71.60% |
> | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% |
> | dTLB-load-misses | 241600871 | 3216108 | -98.67% |
>
> Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> ---
> include/trace/events/huge_memory.h | 1 +
> mm/khugepaged.c | 13 +++++++++++++
> 2 files changed, 14 insertions(+)
>
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index 384e29f6bef0..bcdc57eea270 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -25,6 +25,7 @@
> EM( SCAN_PAGE_LRU, "page_not_in_lru") \
> EM( SCAN_PAGE_LOCK, "page_locked") \
> EM( SCAN_PAGE_ANON, "page_not_anon") \
> + EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \
> EM( SCAN_PAGE_COMPOUND, "page_compound") \
> EM( SCAN_ANY_PROCESS, "no_process_for_page") \
> EM( SCAN_VMA_NULL, "vma_null") \
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index df22b2274d92..b4def001ccd0 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -46,6 +46,7 @@ enum scan_result {
> SCAN_PAGE_LRU,
> SCAN_PAGE_LOCK,
> SCAN_PAGE_ANON,
> + SCAN_PAGE_LAZYFREE,
> SCAN_PAGE_COMPOUND,
> SCAN_ANY_PROCESS,
> SCAN_VMA_NULL,
> @@ -583,6 +584,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> folio = page_folio(page);
> VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
>
> + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> + folio_test_lazyfree(folio)) {
> + result = SCAN_PAGE_LAZYFREE;
> + goto out;
> + }
> +
> /* See hpage_collapse_scan_pmd(). */
> if (folio_maybe_mapped_shared(folio)) {
> ++shared;
> @@ -1332,6 +1339,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> }
> folio = page_folio(page);
>
> + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> + folio_test_lazyfree(folio)) {
> + result = SCAN_PAGE_LAZYFREE;
> + goto out_unmap;
> + }
> +
> if (!folio_test_anon(folio)) {
> result = SCAN_PAGE_ANON;
> goto out_unmap;
Nothing else jumped at me, LGTM.
Reviewed-by: Lance Yang <lance.yang@linux.dev>
On Tue, Feb 3, 2026 at 7:23 PM Lance Yang <lance.yang@linux.dev> wrote:
>
> On 2026/2/1 20:25, Vernon Yang wrote:
> > From: Vernon Yang <yanglincheng@kylinos.cn>
> >
> > For example, create three task: hot1 -> cold -> hot2. After all three
> > task are created, each allocate memory 128MB. the hot1/hot2 task
> > continuously access 128 MB memory, while the cold task only accesses
> > its memory briefly andthen call madvise(MADV_FREE). However, khugepaged
>
> s/andthen/and then/
LGTM, Thank you for review and suggestion, I will do it in the next version.
> > still prioritizes scanning the cold task and only scans the hot2 task
> > after completing the scan of the cold task.
> >
> > And if we collapse with a lazyfree page, that content will never be none
> > and the deferred shrinker cannot reclaim them.
> >
> > So if the user has explicitly informed us via MADV_FREE that this memory
> > will be freed, it is appropriate for khugepaged to skip it only, thereby
> > avoiding unnecessary scan and collapse operations to reducing CPU
> > wastage.
> >
> > Here are the performance test results:
> > (Throughput bigger is better, other smaller is better)
> >
> > Testing on x86_64 machine:
> >
> > | task hot2 | without patch | with patch | delta |
> > |---------------------|---------------|---------------|---------|
> > | total accesses time | 3.14 sec | 2.93 sec | -6.69% |
> > | cycles per access | 4.96 | 2.21 | -55.44% |
> > | Throughput | 104.38 M/sec | 111.89 M/sec | +7.19% |
> > | dTLB-load-misses | 284814532 | 69597236 | -75.56% |
> >
> > Testing on qemu-system-x86_64 -enable-kvm:
> >
> > | task hot2 | without patch | with patch | delta |
> > |---------------------|---------------|---------------|---------|
> > | total accesses time | 3.35 sec | 2.96 sec | -11.64% |
> > | cycles per access | 7.29 | 2.07 | -71.60% |
> > | Throughput | 97.67 M/sec | 110.77 M/sec | +13.41% |
> > | dTLB-load-misses | 241600871 | 3216108 | -98.67% |
> >
> > Signed-off-by: Vernon Yang <yanglincheng@kylinos.cn>
> > ---
> > include/trace/events/huge_memory.h | 1 +
> > mm/khugepaged.c | 13 +++++++++++++
> > 2 files changed, 14 insertions(+)
> >
> > diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> > index 384e29f6bef0..bcdc57eea270 100644
> > --- a/include/trace/events/huge_memory.h
> > +++ b/include/trace/events/huge_memory.h
> > @@ -25,6 +25,7 @@
> > EM( SCAN_PAGE_LRU, "page_not_in_lru") \
> > EM( SCAN_PAGE_LOCK, "page_locked") \
> > EM( SCAN_PAGE_ANON, "page_not_anon") \
> > + EM( SCAN_PAGE_LAZYFREE, "page_lazyfree") \
> > EM( SCAN_PAGE_COMPOUND, "page_compound") \
> > EM( SCAN_ANY_PROCESS, "no_process_for_page") \
> > EM( SCAN_VMA_NULL, "vma_null") \
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index df22b2274d92..b4def001ccd0 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -46,6 +46,7 @@ enum scan_result {
> > SCAN_PAGE_LRU,
> > SCAN_PAGE_LOCK,
> > SCAN_PAGE_ANON,
> > + SCAN_PAGE_LAZYFREE,
> > SCAN_PAGE_COMPOUND,
> > SCAN_ANY_PROCESS,
> > SCAN_VMA_NULL,
> > @@ -583,6 +584,12 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > folio = page_folio(page);
> > VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
> >
> > + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> > + folio_test_lazyfree(folio)) {
> > + result = SCAN_PAGE_LAZYFREE;
> > + goto out;
> > + }
> > +
> > /* See hpage_collapse_scan_pmd(). */
> > if (folio_maybe_mapped_shared(folio)) {
> > ++shared;
> > @@ -1332,6 +1339,12 @@ static enum scan_result hpage_collapse_scan_pmd(struct mm_struct *mm,
> > }
> > folio = page_folio(page);
> >
> > + if (cc->is_khugepaged && !pte_dirty(pteval) &&
> > + folio_test_lazyfree(folio)) {
> > + result = SCAN_PAGE_LAZYFREE;
> > + goto out_unmap;
> > + }
> > +
> > if (!folio_test_anon(folio)) {
> > result = SCAN_PAGE_ANON;
> > goto out_unmap;
>
> Nothing else jumped at me, LGTM.
>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
© 2016 - 2026 Red Hat, Inc.