Setting /sys/kernel/mm/transparent_hugepage/enabled=always allows
applications to benefit from THPs without having to madvise. However, the
page fault handler takes very few considerations to decide whether or not
to actually use a THP. This can lead to a lot of wasted memory. khugepaged
only operates on memory that was either allocated with enabled=always or
marked with MADV_HUGEPAGE.
Introduce the ability to set enabled=defer, which will prevent THPs from
being allocated by the page fault handler unless MADV_HUGEPAGE is set on
the region, leaving it up to khugepaged to decide which allocations will
collapse to a THP. This should allow applications to benefit from THPs,
while curbing some of the memory waste.
Co-developed-by: Rafael Aquini <raquini@redhat.com>
Signed-off-by: Rafael Aquini <raquini@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>
---
include/linux/huge_mm.h | 15 +++++++++++++--
mm/huge_memory.c | 31 +++++++++++++++++++++++++++----
2 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e3d15c737008..57e6c962afb1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_UNSUPPORTED,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
@@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
{
return transparent_hugepage_flags &
((1<<TRANSPARENT_HUGEPAGE_FLAG) |
+ (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
}
@@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
(1<<TRANSPARENT_HUGEPAGE_FLAG);
}
+static inline bool hugepage_global_defer(void)
+{
+ return transparent_hugepage_flags &
+ (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
+}
+
static inline int highest_order(unsigned long orders)
{
return fls_long(orders) - 1;
@@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long tva_flags,
unsigned long orders)
{
+ if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
+ !(vm_flags & VM_HUGEPAGE))
+ return 0;
+
/* Optimization to check if required orders are enabled early. */
if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
unsigned long mask = READ_ONCE(huge_anon_orders_always);
-
if (vm_flags & VM_HUGEPAGE)
mask |= READ_ONCE(huge_anon_orders_madvise);
- if (hugepage_global_always() ||
+ if (hugepage_global_always() || hugepage_global_defer() ||
((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
mask |= READ_ONCE(huge_anon_orders_inherit);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8af5caa0d9bc..17b66adef029 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -297,12 +297,15 @@ static ssize_t enabled_show(struct kobject *kobj,
const char *output;
if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
- output = "[always] madvise never";
+ output = "[always] madvise defer never";
else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags))
- output = "always [madvise] never";
+ output = "always [madvise] defer never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags))
+ output = "always madvise [defer] never";
else
- output = "always madvise [never]";
+ output = "always madvise defer [never]";
return sysfs_emit(buf, "%s\n", output);
}
@@ -315,13 +318,20 @@ static ssize_t enabled_store(struct kobject *kobj,
if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ } else if (sysfs_streq(buf, "defer")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
} else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG, &transparent_hugepage_flags);
} else
ret = -EINVAL;
@@ -954,18 +964,31 @@ static int __init setup_transparent_hugepage(char *str)
&transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
ret = 1;
+ } else if (!strcmp(str, "defer")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
} else if (!strcmp(str, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- &transparent_hugepage_flags);
+ &transparent_hugepage_flags);
ret = 1;
} else if (!strcmp(str, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
&transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
&transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
+ &transparent_hugepage_flags);
ret = 1;
}
out:
--
2.48.1
On 28 Apr 2025, at 14:29, Nico Pache wrote:
> setting /transparent_hugepages/enabled=always allows applications
> to benefit from THPs without having to madvise. However, the pf handler
s/pf/page fault
> takes very few considerations to decide weather or not to actually use a
s/weather/whether
> THP. This can lead to a lot of wasted memory. khugepaged only operates
> on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
>
> Introduce the ability to set enabled=defer, which will prevent THPs from
> being allocated by the page fault handler unless madvise is set,
> leaving it up to khugepaged to decide which allocations will collapse to a
> THP. This should allow applications to benefits from THPs, while curbing
> some of the memory waste.
>
> Co-developed-by: Rafael Aquini <raquini@redhat.com>
> Signed-off-by: Rafael Aquini <raquini@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> include/linux/huge_mm.h | 15 +++++++++++++--
> mm/huge_memory.c | 31 +++++++++++++++++++++++++++----
> 2 files changed, 40 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e3d15c737008..57e6c962afb1 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
> TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> TRANSPARENT_HUGEPAGE_FLAG,
> TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
> + TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
What does INST mean here? Can you add one sentence on this new flag
in the commit log to explain what it is short for?
> TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
> TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
> TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
> @@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
> {
> return transparent_hugepage_flags &
> ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
> + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
> (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
> }
>
> @@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
> (1<<TRANSPARENT_HUGEPAGE_FLAG);
> }
>
> +static inline bool hugepage_global_defer(void)
> +{
> + return transparent_hugepage_flags &
> + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
> +}
> +
> static inline int highest_order(unsigned long orders)
> {
> return fls_long(orders) - 1;
> @@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> unsigned long tva_flags,
> unsigned long orders)
> {
> + if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
> + !(vm_flags & VM_HUGEPAGE))
> + return 0;
> +
> /* Optimization to check if required orders are enabled early. */
> if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
> unsigned long mask = READ_ONCE(huge_anon_orders_always);
> -
This newline should stay, right?
The rest looks good to me. Thanks. Acked-by: Zi Yan <ziy@nvidia.com>
Best Regards,
Yan, Zi
On Tue, Apr 29, 2025 at 7:49 AM Zi Yan <ziy@nvidia.com> wrote:
>
> On 28 Apr 2025, at 14:29, Nico Pache wrote:
>
> > setting /transparent_hugepages/enabled=always allows applications
> > to benefit from THPs without having to madvise. However, the pf handler
>
> s/pf/page fault
>
> > takes very few considerations to decide weather or not to actually use a
>
> s/weather/whether
>
> > THP. This can lead to a lot of wasted memory. khugepaged only operates
> > on memory that was either allocated with enabled=always or MADV_HUGEPAGE.
> >
> > Introduce the ability to set enabled=defer, which will prevent THPs from
> > being allocated by the page fault handler unless madvise is set,
> > leaving it up to khugepaged to decide which allocations will collapse to a
> > THP. This should allow applications to benefits from THPs, while curbing
> > some of the memory waste.
> >
> > Co-developed-by: Rafael Aquini <raquini@redhat.com>
> > Signed-off-by: Rafael Aquini <raquini@redhat.com>
> > Signed-off-by: Nico Pache <npache@redhat.com>
> > ---
> > include/linux/huge_mm.h | 15 +++++++++++++--
> > mm/huge_memory.c | 31 +++++++++++++++++++++++++++----
> > 2 files changed, 40 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> > index e3d15c737008..57e6c962afb1 100644
> > --- a/include/linux/huge_mm.h
> > +++ b/include/linux/huge_mm.h
> > @@ -48,6 +48,7 @@ enum transparent_hugepage_flag {
> > TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> > TRANSPARENT_HUGEPAGE_FLAG,
> > TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
> > + TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG,
>
> What does INST mean here? Can you add one sentence on this new flag
> in the commit log to explain what it is short for?
"INSERT". Someone else commented on the length of this FLAG name. I
forgot to update it.
I can shorten it to something like ..DEFER_FLAG or DEFER_PF_FLAG
>
>
> > TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
> > TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
> > TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
> > @@ -186,6 +187,7 @@ static inline bool hugepage_global_enabled(void)
> > {
> > return transparent_hugepage_flags &
> > ((1<<TRANSPARENT_HUGEPAGE_FLAG) |
> > + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG) |
> > (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG));
> > }
> >
> > @@ -195,6 +197,12 @@ static inline bool hugepage_global_always(void)
> > (1<<TRANSPARENT_HUGEPAGE_FLAG);
> > }
> >
> > +static inline bool hugepage_global_defer(void)
> > +{
> > + return transparent_hugepage_flags &
> > + (1<<TRANSPARENT_HUGEPAGE_DEFER_PF_INST_FLAG);
> > +}
> > +
> > static inline int highest_order(unsigned long orders)
> > {
> > return fls_long(orders) - 1;
> > @@ -291,13 +299,16 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
> > unsigned long tva_flags,
> > unsigned long orders)
> > {
> > + if ((tva_flags & TVA_IN_PF) && hugepage_global_defer() &&
> > + !(vm_flags & VM_HUGEPAGE))
> > + return 0;
> > +
> > /* Optimization to check if required orders are enabled early. */
> > if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
> > unsigned long mask = READ_ONCE(huge_anon_orders_always);
> > -
>
> This newline should stay, right?
Yes, I can fix that.
>
> The rest looks good to me. Thanks. Acked-by: Zi Yan <ziy@nvidia.com>
Thank you!
-- Nico
>
> Best Regards,
> Yan, Zi
>
© 2016 - 2025 Red Hat, Inc.