[v10] khugepaged: mTHP support

[PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats

Posted by Nico Pache 1 month, 2 weeks ago

With mTHP support inplace, let add the per-order mTHP stats for
exceeding NONE, SWAP, and SHARED.

Signed-off-by: Nico Pache <npache@redhat.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
 include/linux/huge_mm.h                    |  3 +++
 mm/huge_memory.c                           |  7 +++++++
 mm/khugepaged.c                            | 16 +++++++++++++---
 4 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 7ccb93e22852..b85547ac4fe9 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -705,6 +705,23 @@ nr_anon_partially_mapped
        an anonymous THP as "partially mapped" and count it here, even though it
        is not actually partially mapped anymore.
 
+collapse_exceed_swap_pte
+       The number of anonymous THP which contain at least one swap PTE.
+       Currently khugepaged does not support collapsing mTHP regions that
+       contain a swap PTE.
+
+collapse_exceed_none_pte
+       The number of anonymous THP which have exceeded the none PTE threshold.
+       With mTHP collapse, a bitmap is used to gather the state of a PMD region
+       and is then recursively checked from largest to smallest order against
+       the scaled max_ptes_none count. This counter indicates that the next
+       enabled order will be checked.
+
+collapse_exceed_shared_pte
+       The number of anonymous THP which contain at least one shared PTE.
+       Currently khugepaged does not support collapsing mTHP regions that
+       contain a shared PTE.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in ``/proc/vmstat`` to help
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 4ada5d1f7297..6f1593d0b4b5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -144,6 +144,9 @@ enum mthp_stat_item {
 	MTHP_STAT_SPLIT_DEFERRED,
 	MTHP_STAT_NR_ANON,
 	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
+	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
+	MTHP_STAT_COLLAPSE_EXCEED_NONE,
+	MTHP_STAT_COLLAPSE_EXCEED_SHARED,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 20d005c2c61f..9f0470c3e983 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
 DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
 DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
 DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
+DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+
 
 static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
@@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
 	&split_deferred_attr.attr,
 	&nr_anon_attr.attr,
 	&nr_anon_partially_mapped_attr.attr,
+	&collapse_exceed_swap_pte_attr.attr,
+	&collapse_exceed_none_pte_attr.attr,
+	&collapse_exceed_shared_pte_attr.attr,
 	NULL,
 };
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index c13bc583a368..5a3386043f39 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
-				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				if (order == HPAGE_PMD_ORDER)
+					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
 				goto out;
 			}
 		}
@@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			 * shared may cause a future higher order collapse on a
 			 * rescan of the same range.
 			 */
-			if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared)) {
+			if (order != HPAGE_PMD_ORDER) {
+				result = SCAN_EXCEED_SHARED_PTE;
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
+				goto out;
+			}
+
+			if (cc->is_khugepaged &&
+			    shared > khugepaged_max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
+				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
 				goto out;
 			}
 		}
@@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 		 * range.
 		 */
 		if (order != HPAGE_PMD_ORDER) {
+			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
 			pte_unmap(pte);
 			mmap_read_unlock(mm);
 			result = SCAN_EXCEED_SWAP_PTE;
-- 
2.50.1

Re: [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats

Posted by Lorenzo Stoakes 1 month, 1 week ago

On Tue, Aug 19, 2025 at 08:16:10AM -0600, Nico Pache wrote:
> With mTHP support inplace, let add the per-order mTHP stats for
> exceeding NONE, SWAP, and SHARED.
>

This is really not enough of a commit message. Exceeding what, where, why,
how? What does 'exceeding' mean here, etc. etc. More words please :)

> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
>  include/linux/huge_mm.h                    |  3 +++
>  mm/huge_memory.c                           |  7 +++++++
>  mm/khugepaged.c                            | 16 +++++++++++++---
>  4 files changed, 40 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index 7ccb93e22852..b85547ac4fe9 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -705,6 +705,23 @@ nr_anon_partially_mapped
>         an anonymous THP as "partially mapped" and count it here, even though it
>         is not actually partially mapped anymore.
>
> +collapse_exceed_swap_pte
> +       The number of anonymous THP which contain at least one swap PTE.

The number of anonymous THP what? Pages? Let's be specific.

> +       Currently khugepaged does not support collapsing mTHP regions that
> +       contain a swap PTE.

Wait what? So we have a counter for something that's unsupported? That
seems not so useful?

> +
> +collapse_exceed_none_pte
> +       The number of anonymous THP which have exceeded the none PTE threshold.

THP pages. What's the 'none PTE threshold'? Do you mean
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none ?

Let's spell that out please, this is far too vague.

> +       With mTHP collapse, a bitmap is used to gather the state of a PMD region
> +       and is then recursively checked from largest to smallest order against
> +       the scaled max_ptes_none count. This counter indicates that the next
> +       enabled order will be checked.

I think you really need to expand upon this as this is confusing and vague.

I also don't think saying 'recursive' here really benefits anything, Just
saying that we try to collapse the largest mTHP size we can in each
instance, and then give a more 'words-y' explanation as to how
max_ptes_none is (in effect) converted to a ratio of a PMD, and then that
ratio is applied to the mTHP sizes.

You can then go on to say that this counter measures the number of
occasions in which this occurred.

> +
> +collapse_exceed_shared_pte
> +       The number of anonymous THP which contain at least one shared PTE.

anonymous THP pages right? :)

> +       Currently khugepaged does not support collapsing mTHP regions that
> +       contain a shared PTE.

Again I don't really understand the purpose of creating a counter for
something we don't support.

Let's add it when we support it.

I also in this case and the exceed swap case don't understand what you mean
by exceed here, you need to spell this out clearly.

Perhaps the context missing here is that you _also_ count THP events in
these counters.

But again, given we have THP_... counters for the stats mTHP doesn't do
yet, I'd say adding these is pointless.

> +
>  As the system ages, allocating huge pages may be expensive as the
>  system uses memory compaction to copy data around memory to free a
>  huge page for use. There are some counters in ``/proc/vmstat`` to help
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 4ada5d1f7297..6f1593d0b4b5 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -144,6 +144,9 @@ enum mthp_stat_item {
>  	MTHP_STAT_SPLIT_DEFERRED,
>  	MTHP_STAT_NR_ANON,
>  	MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> +	MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> +	MTHP_STAT_COLLAPSE_EXCEED_NONE,
> +	MTHP_STAT_COLLAPSE_EXCEED_SHARED,

Wh do we put 'collapse' here but not in the THP equivalents?

>  	__MTHP_STAT_COUNT
>  };
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 20d005c2c61f..9f0470c3e983 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
>  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
>  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
>  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +
>
>  static struct attribute *anon_stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
> @@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
>  	&split_deferred_attr.attr,
>  	&nr_anon_attr.attr,
>  	&nr_anon_partially_mapped_attr.attr,
> +	&collapse_exceed_swap_pte_attr.attr,
> +	&collapse_exceed_none_pte_attr.attr,
> +	&collapse_exceed_shared_pte_attr.attr,
>  	NULL,
>  };
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index c13bc583a368..5a3386043f39 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> -				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);

Hm so wait you were miscounting statistics in patch 10/13 when you turned
all this one? That's not good.

This should be in place _first_ before enabling the feature.

> +				if (order == HPAGE_PMD_ORDER)
> +					count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
>  				goto out;
>  			}
>  		}
> @@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			 * shared may cause a future higher order collapse on a
>  			 * rescan of the same range.
>  			 */
> -			if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> -			    shared > khugepaged_max_ptes_shared)) {
> +			if (order != HPAGE_PMD_ORDER) {

Hm wait what? I dont understand what's going on here? You're no longer
actually doing any check except order != HPAGE_PMD_ORDER?... am I missnig
something?

Again why we are bothering to maintain a counter that doesn't mean anything
I don't know? I may be misinterpreting somehow however.

> +				result = SCAN_EXCEED_SHARED_PTE;
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> +				goto out;
> +			}
> +
> +			if (cc->is_khugepaged &&
> +			    shared > khugepaged_max_ptes_shared) {
>  				result = SCAN_EXCEED_SHARED_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> +				count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
>  				goto out;
>  			}
>  		}
> @@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
>  		 * range.
>  		 */
>  		if (order != HPAGE_PMD_ORDER) {
> +			count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);

This again seems surely to not be testing for what it claims to be
tracking? I may again be missing context here.

>  			pte_unmap(pte);
>  			mmap_read_unlock(mm);
>  			result = SCAN_EXCEED_SWAP_PTE;
> --
> 2.50.1
>

Re: [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats

Posted by Nico Pache 3 weeks, 4 days ago

On Thu, Aug 21, 2025 at 8:49 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Aug 19, 2025 at 08:16:10AM -0600, Nico Pache wrote:
> > With mTHP support inplace, let add the per-order mTHP stats for
> > exceeding NONE, SWAP, and SHARED.
> >
>
> This is really not enough of a commit message. Exceeding what, where, why,
> how? What does 'exceeding' mean here, etc. etc. More words please :)
Ok I will add more in the next version
>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
> >  include/linux/huge_mm.h                    |  3 +++
> >  mm/huge_memory.c                           |  7 +++++++
> >  mm/khugepaged.c                            | 16 +++++++++++++---
> >  4 files changed, 40 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > index 7ccb93e22852..b85547ac4fe9 100644
> > --- a/Documentation/admin-guide/mm/transhuge.rst
> > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > @@ -705,6 +705,23 @@ nr_anon_partially_mapped
> >         an anonymous THP as "partially mapped" and count it here, even though it
> >         is not actually partially mapped anymore.
> >
> > +collapse_exceed_swap_pte
> > +       The number of anonymous THP which contain at least one swap PTE.
>
> The number of anonymous THP what? Pages? Let's be specific.
ack
>
> > +       Currently khugepaged does not support collapsing mTHP regions that
> > +       contain a swap PTE.
>
> Wait what? So we have a counter for something that's unsupported? That
> seems not so useful?
The current implementation does not support swapped out or shared
pages. However these counters allow us to monitor when a mTHP collapse
fails due to exceeding the threshold (ie 0, hitting any swapped out or
shared page)
>
> > +
> > +collapse_exceed_none_pte
> > +       The number of anonymous THP which have exceeded the none PTE threshold.
>
> THP pages. What's the 'none PTE threshold'? Do you mean
> /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none ?
ack, I will expand these descriptions
>
> Let's spell that out please, this is far too vague.
>
> > +       With mTHP collapse, a bitmap is used to gather the state of a PMD region
> > +       and is then recursively checked from largest to smallest order against
> > +       the scaled max_ptes_none count. This counter indicates that the next
> > +       enabled order will be checked.
>
> I think you really need to expand upon this as this is confusing and vague.
>
> I also don't think saying 'recursive' here really benefits anything, Just
> saying that we try to collapse the largest mTHP size we can in each
> instance, and then give a more 'words-y' explanation as to how
> max_ptes_none is (in effect) converted to a ratio of a PMD, and then that
> ratio is applied to the mTHP sizes.
>
> You can then go on to say that this counter measures the number of
> occasions in which this occurred.
ack I will clean it up
>
> > +
> > +collapse_exceed_shared_pte
> > +       The number of anonymous THP which contain at least one shared PTE.
>
> anonymous THP pages right? :)
regions?
>
> > +       Currently khugepaged does not support collapsing mTHP regions that
> > +       contain a shared PTE.
>
> Again I don't really understand the purpose of creating a counter for
> something we don't support.
see above
>
> Let's add it when we support it.
>
> I also in this case and the exceed swap case don't understand what you mean
> by exceed here, you need to spell this out clearly.
>
> Perhaps the context missing here is that you _also_ count THP events in
> these counters.
>
> But again, given we have THP_... counters for the stats mTHP doesn't do
> yet, I'd say adding these is pointless.
>
> > +
> >  As the system ages, allocating huge pages may be expensive as the
> >  system uses memory compaction to copy data around memory to free a
> >  huge page for use. There are some counters in ``/proc/vmstat`` to help
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index 4ada5d1f7297..6f1593d0b4b5 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -144,6 +144,9 @@ enum mthp_stat_item {
> >       MTHP_STAT_SPLIT_DEFERRED,
> >       MTHP_STAT_NR_ANON,
> >       MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> > +     MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> > +     MTHP_STAT_COLLAPSE_EXCEED_NONE,
> > +     MTHP_STAT_COLLAPSE_EXCEED_SHARED,
>
> Wh do we put 'collapse' here but not in the THP equivalents?
to indicate they come from the collapse functionality. I can shorten
it by removing COLLAPSE if youd like
>
> >       __MTHP_STAT_COUNT
> >  };
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 20d005c2c61f..9f0470c3e983 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> >  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> >  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> >  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > +
> >
> >  static struct attribute *anon_stats_attrs[] = {
> >       &anon_fault_alloc_attr.attr,
> > @@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
> >       &split_deferred_attr.attr,
> >       &nr_anon_attr.attr,
> >       &nr_anon_partially_mapped_attr.attr,
> > +     &collapse_exceed_swap_pte_attr.attr,
> > +     &collapse_exceed_none_pte_attr.attr,
> > +     &collapse_exceed_shared_pte_attr.attr,
> >       NULL,
> >  };
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index c13bc583a368..5a3386043f39 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                               continue;
> >                       } else {
> >                               result = SCAN_EXCEED_NONE_PTE;
> > -                             count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>
> Hm so wait you were miscounting statistics in patch 10/13 when you turned
> all this one? That's not good.
>
> This should be in place _first_ before enabling the feature.
Ok I can move them around.
>
> > +                             if (order == HPAGE_PMD_ORDER)
> > +                                     count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> >                               goto out;
> >                       }
> >               }
> > @@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                        * shared may cause a future higher order collapse on a
> >                        * rescan of the same range.
> >                        */
> > -                     if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> > -                         shared > khugepaged_max_ptes_shared)) {
> > +                     if (order != HPAGE_PMD_ORDER) {
>
> Hm wait what? I dont understand what's going on here? You're no longer
> actually doing any check except order != HPAGE_PMD_ORDER?... am I missnig
> something?
>
> Again why we are bothering to maintain a counter that doesn't mean anything
> I don't know? I may be misinterpreting somehow however.
>
> > +                             result = SCAN_EXCEED_SHARED_PTE;
> > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > +                             goto out;
> > +                     }
> > +
> > +                     if (cc->is_khugepaged &&
> > +                         shared > khugepaged_max_ptes_shared) {
> >                               result = SCAN_EXCEED_SHARED_PTE;
> >                               count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> >                               goto out;
> >                       }
> >               }
> > @@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> >                * range.
> >                */
> >               if (order != HPAGE_PMD_ORDER) {
> > +                     count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
>
> This again seems surely to not be testing for what it claims to be
> tracking? I may again be missing context here.
We are bailing out of the mTHP collapse due to it having a SWAP page.
In turn exceeding our threshold of 0.

Cheers,
-- Nico
>
> >                       pte_unmap(pte);
> >                       mmap_read_unlock(mm);
> >                       result = SCAN_EXCEED_SWAP_PTE;
> > --
> > 2.50.1
> >
>

Re: [PATCH v10 12/13] khugepaged: add per-order mTHP khugepaged stats

Posted by Lorenzo Stoakes 3 weeks, 1 day ago

One small nitty note - be super helpful if you could add a newline before/after
your reply or soething like this as

>> blah blah blah
> blah
>> blah blah blah

Is harder to read than:

>> blah blah blah
>
> blah
>
>> blah blah blah

Thanks :)

On Tue, Sep 09, 2025 at 12:36:54AM -0600, Nico Pache wrote:
> On Thu, Aug 21, 2025 at 8:49 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > On Tue, Aug 19, 2025 at 08:16:10AM -0600, Nico Pache wrote:
> > > With mTHP support inplace, let add the per-order mTHP stats for
> > > exceeding NONE, SWAP, and SHARED.
> > >
> >
> > This is really not enough of a commit message. Exceeding what, where, why,
> > how? What does 'exceeding' mean here, etc. etc. More words please :)
> Ok I will add more in the next version

Thanks

> >
> > > Signed-off-by: Nico Pache <npache@redhat.com>
> > > ---
> > >  Documentation/admin-guide/mm/transhuge.rst | 17 +++++++++++++++++
> > >  include/linux/huge_mm.h                    |  3 +++
> > >  mm/huge_memory.c                           |  7 +++++++
> > >  mm/khugepaged.c                            | 16 +++++++++++++---
> > >  4 files changed, 40 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> > > index 7ccb93e22852..b85547ac4fe9 100644
> > > --- a/Documentation/admin-guide/mm/transhuge.rst
> > > +++ b/Documentation/admin-guide/mm/transhuge.rst
> > > @@ -705,6 +705,23 @@ nr_anon_partially_mapped
> > >         an anonymous THP as "partially mapped" and count it here, even though it
> > >         is not actually partially mapped anymore.
> > >
> > > +collapse_exceed_swap_pte
> > > +       The number of anonymous THP which contain at least one swap PTE.
> >
> > The number of anonymous THP what? Pages? Let's be specific.
> ack

Thanks

> >
> > > +       Currently khugepaged does not support collapsing mTHP regions that
> > > +       contain a swap PTE.
> >
> > Wait what? So we have a counter for something that's unsupported? That
> > seems not so useful?
> The current implementation does not support swapped out or shared
> pages. However these counters allow us to monitor when a mTHP collapse
> fails due to exceeding the threshold (ie 0, hitting any swapped out or
> shared page)

So the collapse counters are not measuring collapses? That seems a bit
confusing. Or actually is this implied in the 'exceed' bit? Because that'd make
sense actually.

But let's obviously document this carefully.


> >
> > > +
> > > +collapse_exceed_none_pte
> > > +       The number of anonymous THP which have exceeded the none PTE threshold.
> >
> > THP pages. What's the 'none PTE threshold'? Do you mean
> > /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none ?
> ack, I will expand these descriptions

Thanks.

> >
> > Let's spell that out please, this is far too vague.
> >
> > > +       With mTHP collapse, a bitmap is used to gather the state of a PMD region
> > > +       and is then recursively checked from largest to smallest order against
> > > +       the scaled max_ptes_none count. This counter indicates that the next
> > > +       enabled order will be checked.
> >
> > I think you really need to expand upon this as this is confusing and vague.
> >
> > I also don't think saying 'recursive' here really benefits anything, Just
> > saying that we try to collapse the largest mTHP size we can in each
> > instance, and then give a more 'words-y' explanation as to how
> > max_ptes_none is (in effect) converted to a ratio of a PMD, and then that
> > ratio is applied to the mTHP sizes.
> >
> > You can then go on to say that this counter measures the number of
> > occasions in which this occurred.
> ack I will clean it up

Thanks

> >
> > > +
> > > +collapse_exceed_shared_pte
> > > +       The number of anonymous THP which contain at least one shared PTE.
> >
> > anonymous THP pages right? :)
> regions?

I don't understand what regions would mean here?

So what are you actually measuring? The number of anonymous THP mappings? If so
I think 'mappings' is probably better. Or 'The number of anonymous THP page
table ranges...' perhaps?

> >
> > > +       Currently khugepaged does not support collapsing mTHP regions that
> > > +       contain a shared PTE.
> >
> > Again I don't really understand the purpose of creating a counter for
> > something we don't support.

> see above

Ack

> >
> > Let's add it when we support it.
> >
> > I also in this case and the exceed swap case don't understand what you mean
> > by exceed here, you need to spell this out clearly.
> >
> > Perhaps the context missing here is that you _also_ count THP events in
> > these counters.
> >
> > But again, given we have THP_... counters for the stats mTHP doesn't do
> > yet, I'd say adding these is pointless.
> >
> > > +
> > >  As the system ages, allocating huge pages may be expensive as the
> > >  system uses memory compaction to copy data around memory to free a
> > >  huge page for use. There are some counters in ``/proc/vmstat`` to help
> > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > > index 4ada5d1f7297..6f1593d0b4b5 100644
> > > --- a/include/linux/huge_mm.h
> > > +++ b/include/linux/huge_mm.h
> > > @@ -144,6 +144,9 @@ enum mthp_stat_item {
> > >       MTHP_STAT_SPLIT_DEFERRED,
> > >       MTHP_STAT_NR_ANON,
> > >       MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
> > > +     MTHP_STAT_COLLAPSE_EXCEED_SWAP,
> > > +     MTHP_STAT_COLLAPSE_EXCEED_NONE,
> > > +     MTHP_STAT_COLLAPSE_EXCEED_SHARED,
> >
> > Wh do we put 'collapse' here but not in the THP equivalents?

> to indicate they come from the collapse functionality. I can shorten
> it by removing COLLAPSE if youd like

Hmm, if this is actually giving information then fine to keep.

> >
> > >       __MTHP_STAT_COUNT
> > >  };
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 20d005c2c61f..9f0470c3e983 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -639,6 +639,10 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
> > >  DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
> > >  DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
> > >  DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
> > > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> > > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > > +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > > +
> > >
> > >  static struct attribute *anon_stats_attrs[] = {
> > >       &anon_fault_alloc_attr.attr,
> > > @@ -655,6 +659,9 @@ static struct attribute *anon_stats_attrs[] = {
> > >       &split_deferred_attr.attr,
> > >       &nr_anon_attr.attr,
> > >       &nr_anon_partially_mapped_attr.attr,
> > > +     &collapse_exceed_swap_pte_attr.attr,
> > > +     &collapse_exceed_none_pte_attr.attr,
> > > +     &collapse_exceed_shared_pte_attr.attr,
> > >       NULL,
> > >  };
> > >
> > > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > > index c13bc583a368..5a3386043f39 100644
> > > --- a/mm/khugepaged.c
> > > +++ b/mm/khugepaged.c
> > > @@ -594,7 +594,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                               continue;
> > >                       } else {
> > >                               result = SCAN_EXCEED_NONE_PTE;
> > > -                             count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> >
> > Hm so wait you were miscounting statistics in patch 10/13 when you turned
> > all this one? That's not good.
> >
> > This should be in place _first_ before enabling the feature.
> Ok I can move them around.

Thanks

> >
> > > +                             if (order == HPAGE_PMD_ORDER)
> > > +                                     count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> > > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE);
> > >                               goto out;
> > >                       }
> > >               }
> > > @@ -633,10 +635,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> > >                        * shared may cause a future higher order collapse on a
> > >                        * rescan of the same range.
> > >                        */
> > > -                     if (order != HPAGE_PMD_ORDER || (cc->is_khugepaged &&
> > > -                         shared > khugepaged_max_ptes_shared)) {
> > > +                     if (order != HPAGE_PMD_ORDER) {
> >
> > Hm wait what? I dont understand what's going on here? You're no longer
> > actually doing any check except order != HPAGE_PMD_ORDER?... am I missnig
> > something?
> >
> > Again why we are bothering to maintain a counter that doesn't mean anything
> > I don't know? I may be misinterpreting somehow however.

I guess answered by rest.

> >
> > > +                             result = SCAN_EXCEED_SHARED_PTE;
> > > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > > +                             goto out;
> > > +                     }
> > > +
> > > +                     if (cc->is_khugepaged &&
> > > +                         shared > khugepaged_max_ptes_shared) {
> > >                               result = SCAN_EXCEED_SHARED_PTE;
> > >                               count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
> > > +                             count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED);
> > >                               goto out;
> > >                       }
> > >               }
> > > @@ -1084,6 +1093,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
> > >                * range.
> > >                */
> > >               if (order != HPAGE_PMD_ORDER) {
> > > +                     count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP);
> >
> > This again seems surely to not be testing for what it claims to be
> > tracking? I may again be missing context here.
> We are bailing out of the mTHP collapse due to it having a SWAP page.
> In turn exceeding our threshold of 0.

OK.

>
> Cheers,
> -- Nico
> >
> > >                       pte_unmap(pte);
> > >                       mmap_read_unlock(mm);
> > >                       result = SCAN_EXCEED_SWAP_PTE;
> > > --
> > > 2.50.1
> > >
> >
>