[PATCH v5 1/4] mm: defer THP insertion to khugepaged

Nico Pache posted 4 patches 7 months, 3 weeks ago
There is a newer version of this series
[PATCH v5 1/4] mm: defer THP insertion to khugepaged
Posted by Nico Pache 7 months, 3 weeks ago
setting /transparent_hugepages/enabled=always allows applications
to benefit from THPs without having to madvise. However, the pf handler
takes very few considerations to decide weather or not to actually use a
THP. This can lead to a lot of wasted memory. khugepaged only operates
on memory that was either allocated with enabled=always or MADV_HUGEPAGE.

Introduce the ability to set enabled=defer, which will prevent THPs from
being allocated by the page fault handler unless madvise is set,
leaving it up to khugepaged to decide which allocations will collapse to a
THP. This should allow applications to benefits from THPs, while curbing
some of the memory waste.

Co-developed-by: Rafael Aquini <raquini@redhat.com>
Signed-off-by: Rafael Aquini <raquini@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
 include/linux/huge_mm.h | 15 +++++++++++++--
 mm/huge_memory.c        | 31 +++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e3d15c737008..57e6c962afb1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
 	TRANSPARENT_HUGEPAGE_FLAG,
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+	TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
@@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
 {
 	return transparent_hugepage_flags &
 			((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+			(1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
 			(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
 }
 
@@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
 			(1<<TRANSPARENT_HUGEPAGE_FLAG);
 }
 
+static inline bool hugepage_global_defer(void)
+{
+	return transparent_hugepage_flags &
+			(1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
+}
+
 static inline int highest_order(unsigned long orders)
 {
 	return fls_long(orders) - 1;
@@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 				       unsigned long tva_flags,
 				       unsigned long orders)
 {
+	if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
+			!(vm_flags & VM_HUGEPAGE))
+		return 0;
+
 	/* Optimization to check if required orders are enabled early. */
 	if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
-
 		if (vm_flags & VM_HUGEPAGE)
 			mask |= READ_ONCE(huge_anon_orders_madvise);
-		if (hugepage_global_always() ||
+		if (hugepage_global_always() || hugepage_global_defer() ||
 		    ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
 			mask |= READ_ONCE(huge_anon_orders_inherit);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8af5caa0d9bc..17b66adef029 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -297,12 +297,15 @@ static ssize_t enabled_show(struct kobject *kobj,
 	const char *output;
 
 	if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
-		output = "[always] madvise never";
+		output = "[always] madvise defer never";
 	else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 			  &transparent_hugepage_flags))
-		output = "always [madvise] never";
+		output = "always [madvise] defer never";
+	else if (test_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+			  &transparent_hugepage_flags))
+		output = "always madvise [defer] never";
 	else
-		output = "always madvise [never]";
+		output = "always madvise defer [never]";
 
 	return sysfs_emit(buf, "%s\n", output);
 }
@@ -315,13 +318,20 @@ static ssize_t enabled_store(struct kobject *kobj,
 
 	if (sysfs_streq(buf, "always")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+	} else if (sysfs_streq(buf, "defer")) {
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
 	} else if (sysfs_streq(buf, "madvise")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 	} else if (sysfs_streq(buf, "never")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
 	} else
 		ret = -EINVAL;
 
@@ -954,18 +964,31 @@ static int __init setup_transparent_hugepage(char *str)
 			&transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 			  &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+			  &transparent_hugepage_flags);
 		ret = 1;
+	} else if (!strcmp(str, "defer")) {
+		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+			  &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+			  &transparent_hugepage_flags);
+		set_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+			  &transparent_hugepage_flags);
 	} else if (!strcmp(str, "madvise")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 			  &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+			  &transparent_hugepage_flags);
 		set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-			&transparent_hugepage_flags);
+			  &transparent_hugepage_flags);
 		ret = 1;
 	} else if (!strcmp(str, "never")) {
 		clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 			  &transparent_hugepage_flags);
 		clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 			  &transparent_hugepage_flags);
+		clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+			  &transparent_hugepage_flags);
 		ret = 1;
 	}
 out:
-- 
2.48.1
Re: [PATCH v5 1/4] mm: defer THP insertion to khugepaged
Posted by Zi Yan 7 months, 3 weeks ago
On 28 Apr 2025, at 14:29, Nico Pache wrote:

> setting /transparent_hugepages/enabled=always allows applications
> to benefit from THPs without having to madvise. However, the pf handler

s/pf/page fault

> takes very few considerations to decide weather or not to actually use a

s/weather/whether

> THP. This can lead to a lot of wasted memory. khugepaged only operates
> on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
>
> Introduce the ability to set enabled=defer, which will prevent THPs from
> being allocated by the page fault handler unless madvise is set,
> leaving it up to khugepaged to decide which allocations will collapse to a
> THP. This should allow applications to benefits from THPs, while curbing
> some of the memory waste.
>
> Co-developed-by: Rafael Aquini <raquini@redhat.com>
> Signed-off-by: Rafael Aquini <raquini@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  include/linux/huge_mm.h | 15 +++++++++++++--
>  mm/huge_memory.c        | 31 +++++++++++++++++++++++++++----
>  2 files changed, 40 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e3d15c737008..57e6c962afb1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
>  	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
>  	TRANSPARENT_HUGEPAGE_FLAG,
>  	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
> +	TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,

What does INST mean here? Can you add one sentence on this new flag
in the commit log to explain what it is short for?


>  	TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
>  	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
>  	TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
> @@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
>  {
>  	return transparent_hugepage_flags &
>  			((1<<TRANSPARENT_HUGEPAGE_FLAG) |
> +			(1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
>  			(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
>  }
>
> @@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
>  			(1<<TRANSPARENT_HUGEPAGE_FLAG);
>  }
>
> +static inline bool hugepage_global_defer(void)
> +{
> +	return transparent_hugepage_flags &
> +			(1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
> +}
> +
>  static inline int highest_order(unsigned long orders)
>  {
>  	return fls_long(orders) - 1;
> @@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>  				       unsigned long tva_flags,
>  				       unsigned long orders)
>  {
> +	if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
> +			!(vm_flags & VM_HUGEPAGE))
> +		return 0;
> +
>  	/* Optimization to check if required orders are enabled early. */
>  	if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
>  		unsigned long mask = READ_ONCE(huge_anon_orders_always);
> -

This newline should stay, right?

The rest looks good to me. Thanks. Acked-by: Zi Yan <ziy@nvidia.com>

Best Regards,
Yan, Zi
Re: [PATCH v5 1/4] mm: defer THP insertion to khugepaged
Posted by Nico Pache 7 months, 3 weeks ago
On Tue, Apr 29, 2025 at 7:49 AM Zi Yan <ziy@nvidia.com> wrote:
>
> On 28 Apr 2025, at 14:29, Nico Pache wrote:
>
> > setting /transparent_hugepages/enabled=always allows applications
> > to benefit from THPs without having to madvise. However, the pf handler
>
> s/pf/page fault
>
> > takes very few considerations to decide weather or not to actually use a
>
> s/weather/whether
>
> > THP. This can lead to a lot of wasted memory. khugepaged only operates
> > on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
> >
> > Introduce the ability to set enabled=defer, which will prevent THPs from
> > being allocated by the page fault handler unless madvise is set,
> > leaving it up to khugepaged to decide which allocations will collapse to a
> > THP. This should allow applications to benefits from THPs, while curbing
> > some of the memory waste.
> >
> > Co-developed-by: Rafael Aquini <raquini@redhat.com>
> > Signed-off-by: Rafael Aquini <raquini@redhat.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> >  include/linux/huge_mm.h | 15 +++++++++++++--
> >  mm/huge_memory.c        | 31 +++++++++++++++++++++++++++----
> >  2 files changed, 40 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index e3d15c737008..57e6c962afb1 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
> >       TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> >       TRANSPARENT_HUGEPAGE_FLAG,
> >       TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
> > +     TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
>
> What does INST mean here? Can you add one sentence on this new flag
> in the commit log to explain what it is short for?
"INSERT". Someone else commented on the length of this FLAG name. I
forgot to update it.
I can shorten it to something like ..DEFER_FLAG or DEFER_PF_FLAG
>
>
> >       TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
> >       TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
> >       TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
> > @@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
> >  {
> >       return transparent_hugepage_flags &
> >                       ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
> > +                     (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
> >                       (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
> >  }
> >
> > @@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
> >                       (1<<TRANSPARENT_HUGEPAGE_FLAG);
> >  }
> >
> > +static inline bool hugepage_global_defer(void)
> > +{
> > +     return transparent_hugepage_flags &
> > +                     (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
> > +}
> > +
> >  static inline int highest_order(unsigned long orders)
> >  {
> >       return fls_long(orders) - 1;
> > @@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> >                                      unsigned long tva_flags,
> >                                      unsigned long orders)
> >  {
> > +     if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
> > +                     !(vm_flags & VM_HUGEPAGE))
> > +             return 0;
> > +
> >       /* Optimization to check if required orders are enabled early. */
> >       if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
> >               unsigned long mask = READ_ONCE(huge_anon_orders_always);
> > -
>
> This newline should stay, right?
Yes, I can fix that.
>
> The rest looks good to me. Thanks. Acked-by: Zi Yan <ziy@nvidia.com>
Thank you!
-- Nico
>
> Best Regards,
> Yan, Zi
>