From: David Hildenbrand <david@redhat.com>
Let's allow for making MADV_COLLAPSE succeed on areas that neither have
VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled
unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED).
MADV_COLLAPSE is a clear advise that we want to collapse.
Note that we still respect the VM_NOHUGEPAGE flag, just like
MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only
refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED.
Co-developed-by: Usama Arif <usamaarif642@gmail.com>
Signed-off-by: Usama Arif <usamaarif642@gmail.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
include/linux/huge_mm.h | 8 +++++++-
include/uapi/linux/prctl.h | 2 +-
mm/huge_memory.c | 5 +++--
mm/memory.c | 6 ++++--
mm/shmem.c | 2 +-
5 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b0ff54eee81c..aeaf93f8ac2e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -329,7 +329,7 @@ struct thpsize {
* through madvise or prctl.
*/
static inline bool vma_thp_disabled(struct vm_area_struct *vma,
- vm_flags_t vm_flags)
+ vm_flags_t vm_flags, bool forced_collapse)
{
/* Are THPs disabled for this VMA? */
if (vm_flags & VM_NOHUGEPAGE)
@@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma,
*/
if (vm_flags & VM_HUGEPAGE)
return false;
+ /*
+ * Forcing a collapse (e.g., madv_collapse), is a clear advise to
+ * use THPs.
+ */
+ if (forced_collapse)
+ return false;
return test_bit(MMF_DISABLE_THP_EXCEPT_ADVISED, &vma->vm_mm->flags);
}
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 9c1d6e49b8a9..ee4165738779 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -185,7 +185,7 @@ struct prctl_mm_map {
#define PR_SET_THP_DISABLE 41
/*
* Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE /
- * VM_HUGEPAGE).
+ * VM_HUGEPAGE / MADV_COLLAPSE).
*/
# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
#define PR_GET_THP_DISABLE 42
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 85252b468f80..ef5ccb0ec5d5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
{
const bool smaps = type == TVA_SMAPS;
const bool in_pf = type == TVA_PAGEFAULT;
- const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE;
+ const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
+ const bool enforce_sysfs = !forced_collapse;
unsigned long supported_orders;
/* Check the intersection of requested and supported orders. */
@@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
diff --git a/mm/memory.c b/mm/memory.c
index be761753f240..bd04212d6f79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5186,9 +5186,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa
* It is too late to allocate a small folio, we already have a large
* folio in the pagecache: especially s390 KVM cannot tolerate any
* PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
- * PMD mappings if THPs are disabled.
+ * PMD mappings if THPs are disabled. As we already have a THP ...
+ * behave as if we are forcing a collapse.
*/
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags,
+ /* forced_collapse=*/ true))
return ret;
if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
diff --git a/mm/shmem.c b/mm/shmem.c
index e6cdfda08aed..30609197a266 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1816,7 +1816,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
vm_flags_t vm_flags = vma ? vma->vm_flags : 0;
unsigned int global_orders;
- if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags)))
+ if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force)))
return 0;
global_orders = shmem_huge_global_enabled(inode, index, write_end,
--
2.47.3
Nits on subject: - It's >75 chars - advise is the verb, advice is the noun. On Thu, Jul 31, 2025 at 01:27:20PM +0100, Usama Arif wrote: > From: David Hildenbrand <david@redhat.com> > > Let's allow for making MADV_COLLAPSE succeed on areas that neither have > VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled > unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). Hmm, I'm not sure about this. So far this prctl() has been the only way to override MADV_COLLAPSE behaviour, but now we're allowing for this one case to not. I suppose the precedent is that MADV_COLLAPSE overrides 'madvise' sysfs behaviour. I suppose what saves us here is 'advised' can be read to mean either MADV_HUGEPAGE or MADV_COLLAPSE. And yes, MADV_COLLAPSE is clearly the user requesting this behaviour. I think the vagueness here is one that already existed, because one could perfectly one have expected MADV_COLLAPSE to obey sysfs and require MADV_HUGEPAGE to have been applied, but of course this is not the case. OK so fine. BUT. I think the MADV_COLLAPSE man page will need to be updated to mention this. And I REALLY think we should update the THP doc too to mention all these prctl() modes. I'm not sure we cover that right now _at all_ and obviously we should describe the new flags. Usama - can you add a patch to this series to do that? > > MADV_COLLAPSE is a clear advise that we want to collapse. advise -> advice. > > Note that we still respect the VM_NOHUGEPAGE flag, just like > MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only > refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED. You also need to mention the shmem change you've made I think. > > Co-developed-by: Usama Arif <usamaarif642@gmail.com> > Signed-off-by: Usama Arif <usamaarif642@gmail.com> > Signed-off-by: David Hildenbrand <david@redhat.com> > --- > include/linux/huge_mm.h | 8 +++++++- > include/uapi/linux/prctl.h | 2 +- > mm/huge_memory.c | 5 +++-- > mm/memory.c | 6 ++++-- > mm/shmem.c | 2 +- > 5 files changed, 16 insertions(+), 7 deletions(-) > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > index b0ff54eee81c..aeaf93f8ac2e 100644 > --- a/include/linux/huge_mm.h > +++ b/include/linux/huge_mm.h > @@ -329,7 +329,7 @@ struct thpsize { > * through madvise or prctl. > */ > static inline bool vma_thp_disabled(struct vm_area_struct *vma, > - vm_flags_t vm_flags) > + vm_flags_t vm_flags, bool forced_collapse) > { > /* Are THPs disabled for this VMA? */ > if (vm_flags & VM_NOHUGEPAGE) > @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, > */ > if (vm_flags & VM_HUGEPAGE) > return false; > + /* > + * Forcing a collapse (e.g., madv_collapse), is a clear advise to advise -> advice. > + * use THPs. > + */ > + if (forced_collapse) > + return false; > return test_bit(MMF_DISABLE_THP_EXCEPT_ADVISED, &vma->vm_mm->flags); > } > > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h > index 9c1d6e49b8a9..ee4165738779 100644 > --- a/include/uapi/linux/prctl.h > +++ b/include/uapi/linux/prctl.h > @@ -185,7 +185,7 @@ struct prctl_mm_map { > #define PR_SET_THP_DISABLE 41 > /* > * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / > - * VM_HUGEPAGE). > + * VM_HUGEPAGE / MADV_COLLAPSE). This is confusing you're mixing VMA flags with MADV ones... maybe just stick to madvise ones, or add extra context around VM_HUGEPAGE bit? Would need to be fixed up in a prior commit obviously. > */ > # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) > #define PR_GET_THP_DISABLE 42 > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 85252b468f80..ef5ccb0ec5d5 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, > { > const bool smaps = type == TVA_SMAPS; > const bool in_pf = type == TVA_PAGEFAULT; > - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; > + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; > + const bool enforce_sysfs = !forced_collapse; Can we just get rid of this enforce_sysfs altogether in patch 2/5 and use forced_collapse? The first place we use it we negate it: return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, !enforce_sysfs); And the one other place we'd have to negate, but it actually helps document behaviour I think. > unsigned long supported_orders; > > /* Check the intersection of requested and supported orders. */ > @@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, > if (!vma->vm_mm) /* vdso */ > return 0; > > - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) > + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) > return 0; > > /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ > diff --git a/mm/memory.c b/mm/memory.c > index be761753f240..bd04212d6f79 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -5186,9 +5186,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa > * It is too late to allocate a small folio, we already have a large > * folio in the pagecache: especially s390 KVM cannot tolerate any > * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any > - * PMD mappings if THPs are disabled. > + * PMD mappings if THPs are disabled. As we already have a THP ... > + * behave as if we are forcing a collapse. > */ > - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) > + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, > + /* forced_collapse=*/ true)) > return ret; > > if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) > diff --git a/mm/shmem.c b/mm/shmem.c > index e6cdfda08aed..30609197a266 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -1816,7 +1816,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, > vm_flags_t vm_flags = vma ? vma->vm_flags : 0; > unsigned int global_orders; > > - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) > + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) Hm this is an extra bit of logic, as noted above, we definitely need to mention we're changing this in the commit message also. Since shmem_allowable_huge_orders() can only be invoked by __thp_vma_allowable_orders() with shmem_huge_force set true, and it'd only be the case should TVA_FORCED_COLLAPSE is set, this makes sense. > return 0; > > global_orders = shmem_huge_global_enabled(inode, index, write_end, > -- > 2.47.3 >
On 31.07.25 16:38, Lorenzo Stoakes wrote: > Nits on subject: > > - It's >75 chars No big deal. If we cna come up with something shorter, good. > - advise is the verb, advice is the noun. Yeah. > > On Thu, Jul 31, 2025 at 01:27:20PM +0100, Usama Arif wrote: >> From: David Hildenbrand <david@redhat.com> >> >> Let's allow for making MADV_COLLAPSE succeed on areas that neither have >> VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled >> unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). > > Hmm, I'm not sure about this. > > So far this prctl() has been the only way to override MADV_COLLAPSE > behaviour, but now we're allowing for this one case to not. This is not an override really. prctl() disallowed MADV_COLLAPSE, but in the new mode we don't want that anymore. > > I suppose the precedent is that MADV_COLLAPSE overrides 'madvise' sysfs > behaviour. > > I suppose what saves us here is 'advised' can be read to mean either > MADV_HUGEPAGE or MADV_COLLAPSE. > > And yes, MADV_COLLAPSE is clearly the user requesting this behaviour. Exactly. > > I think the vagueness here is one that already existed, because one could > perfectly one have expected MADV_COLLAPSE to obey sysfs and require > MADV_HUGEPAGE to have been applied, but of course this is not the case. Yes. > > OK so fine. > > BUT. > > I think the MADV_COLLAPSE man page will need to be updated to mention this. > Yes. > And I REALLY think we should update the THP doc too to mention all these > prctl() modes. > > I'm not sure we cover that right now _at all_ and obviously we should > describe the new flags. > > Usama - can you add a patch to this series to do that? Good point, let's document the interaction with prctl(). > >> >> MADV_COLLAPSE is a clear advise that we want to collapse. > > advise -> advice. > >> >> Note that we still respect the VM_NOHUGEPAGE flag, just like >> MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only >> refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED. > > You also need to mention the shmem change you've made I think. Yes. > >> >> Co-developed-by: Usama Arif <usamaarif642@gmail.com> >> Signed-off-by: Usama Arif <usamaarif642@gmail.com> >> Signed-off-by: David Hildenbrand <david@redhat.com> >> --- >> include/linux/huge_mm.h | 8 +++++++- >> include/uapi/linux/prctl.h | 2 +- >> mm/huge_memory.c | 5 +++-- >> mm/memory.c | 6 ++++-- >> mm/shmem.c | 2 +- >> 5 files changed, 16 insertions(+), 7 deletions(-) >> >> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h >> index b0ff54eee81c..aeaf93f8ac2e 100644 >> --- a/include/linux/huge_mm.h >> +++ b/include/linux/huge_mm.h >> @@ -329,7 +329,7 @@ struct thpsize { >> * through madvise or prctl. >> */ >> static inline bool vma_thp_disabled(struct vm_area_struct *vma, >> - vm_flags_t vm_flags) >> + vm_flags_t vm_flags, bool forced_collapse) >> { >> /* Are THPs disabled for this VMA? */ >> if (vm_flags & VM_NOHUGEPAGE) >> @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, >> */ >> if (vm_flags & VM_HUGEPAGE) >> return false; >> + /* >> + * Forcing a collapse (e.g., madv_collapse), is a clear advise to > > advise -> advice. > >> + * use THPs. >> + */ >> + if (forced_collapse) >> + return false; >> return test_bit(MMF_DISABLE_THP_EXCEPT_ADVISED, &vma->vm_mm->flags); >> } >> >> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h >> index 9c1d6e49b8a9..ee4165738779 100644 >> --- a/include/uapi/linux/prctl.h >> +++ b/include/uapi/linux/prctl.h >> @@ -185,7 +185,7 @@ struct prctl_mm_map { >> #define PR_SET_THP_DISABLE 41 >> /* >> * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / >> - * VM_HUGEPAGE). >> + * VM_HUGEPAGE / MADV_COLLAPSE). > > This is confusing you're mixing VMA flags with MADV ones... maybe just > stick to madvise ones, or add extra context around VM_HUGEPAGE bit? I don't see anything confusing here, really. But if it helps you, we can do (e.g., MADV_HUGEPAGE / VM_HUGEPAGE, MADV_COLLAPSE). (reason VM_HUGEPAGE is spelled out is that there might be code where we set VM_HUGEPAGE implicitly in the kernel) > > Would need to be fixed up in a prior commit obviously. > >> */ >> # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) >> #define PR_GET_THP_DISABLE 42 >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 85252b468f80..ef5ccb0ec5d5 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, >> { >> const bool smaps = type == TVA_SMAPS; >> const bool in_pf = type == TVA_PAGEFAULT; >> - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; >> + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; >> + const bool enforce_sysfs = !forced_collapse; > > Can we just get rid of this enforce_sysfs altogether in patch 2/5 and use > forced_collapse? Let's do that as a separate cleanup on top. I want least churn in that patch. (had the same idea while writing that patch, but I have other things to focus on than cleaning up all this mess) -- Cheers, David / dhildenb
On Thu, Jul 31, 2025 at 04:54:38PM +0200, David Hildenbrand wrote: > On 31.07.25 16:38, Lorenzo Stoakes wrote: > > Nits on subject: > > > > - It's >75 chars > > No big deal. If we cna come up with something shorter, good. > > > - advise is the verb, advice is the noun. > > Yeah. > > > > > On Thu, Jul 31, 2025 at 01:27:20PM +0100, Usama Arif wrote: > > > From: David Hildenbrand <david@redhat.com> > > > > > > Let's allow for making MADV_COLLAPSE succeed on areas that neither have > > > VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled > > > unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). > > > > Hmm, I'm not sure about this. > > > > So far this prctl() has been the only way to override MADV_COLLAPSE > > behaviour, but now we're allowing for this one case to not. > > This is not an override really. prctl() disallowed MADV_COLLAPSE, but in the > new mode we don't want that anymore. Yeah see below, I sort of convinced myself this is OK. > > > > I suppose the precedent is that MADV_COLLAPSE overrides 'madvise' sysfs > > behaviour. > > > I suppose what saves us here is 'advised' can be read to mean either > > MADV_HUGEPAGE or MADV_COLLAPSE. > > > And yes, MADV_COLLAPSE is clearly the user requesting this behaviour. > > Exactly. And really I guess it's logical right? If you MADV_COLLAPSE you are saying 'I want THPs', if you MADV_HUGEPAGE you are saying 'I want THPs', the difference being on timing. > > > > > I think the vagueness here is one that already existed, because one could > > perfectly one have expected MADV_COLLAPSE to obey sysfs and require > > MADV_HUGEPAGE to have been applied, but of course this is not the case. > > Yes. > > > > > OK so fine. > > > > BUT. > > > > I think the MADV_COLLAPSE man page will need to be updated to mention this. > > > > Yes. > > > And I REALLY think we should update the THP doc too to mention all these > > prctl() modes. > > > > I'm not sure we cover that right now _at all_ and obviously we should > > describe the new flags. > > > > Usama - can you add a patch to this series to do that? > > Good point, let's document the interaction with prctl(). Thanks, I think we haven't even spoken that much about MADV_COLLAPSE in the docs, somebody had a patch but then it fizzled out. But that's a separate task, will add to my TODOs in case nobody else picks up. > > > > > > > > > MADV_COLLAPSE is a clear advise that we want to collapse. > > > > advise -> advice. > > > > > > > > Note that we still respect the VM_NOHUGEPAGE flag, just like > > > MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only > > > refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED. > > > > You also need to mention the shmem change you've made I think. > > Yes. > > > >> > > > Co-developed-by: Usama Arif <usamaarif642@gmail.com> > > > Signed-off-by: Usama Arif <usamaarif642@gmail.com> > > > Signed-off-by: David Hildenbrand <david@redhat.com> > > > --- > > > include/linux/huge_mm.h | 8 +++++++- > > > include/uapi/linux/prctl.h | 2 +- > > > mm/huge_memory.c | 5 +++-- > > > mm/memory.c | 6 ++++-- > > > mm/shmem.c | 2 +- > > > 5 files changed, 16 insertions(+), 7 deletions(-) > > > > > > diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h > > > index b0ff54eee81c..aeaf93f8ac2e 100644 > > > --- a/include/linux/huge_mm.h > > > +++ b/include/linux/huge_mm.h > > > @@ -329,7 +329,7 @@ struct thpsize { > > > * through madvise or prctl. > > > */ > > > static inline bool vma_thp_disabled(struct vm_area_struct *vma, > > > - vm_flags_t vm_flags) > > > + vm_flags_t vm_flags, bool forced_collapse) > > > { > > > /* Are THPs disabled for this VMA? */ > > > if (vm_flags & VM_NOHUGEPAGE) > > > @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, > > > */ > > > if (vm_flags & VM_HUGEPAGE) > > > return false; > > > + /* > > > + * Forcing a collapse (e.g., madv_collapse), is a clear advise to > > > > advise -> advice. > > > > > + * use THPs. > > > + */ > > > + if (forced_collapse) > > > + return false; > > > return test_bit(MMF_DISABLE_THP_EXCEPT_ADVISED, &vma->vm_mm->flags); > > > } > > > > > > diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h > > > index 9c1d6e49b8a9..ee4165738779 100644 > > > --- a/include/uapi/linux/prctl.h > > > +++ b/include/uapi/linux/prctl.h > > > @@ -185,7 +185,7 @@ struct prctl_mm_map { > > > #define PR_SET_THP_DISABLE 41 > > > /* > > > * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / > > > - * VM_HUGEPAGE). > > > + * VM_HUGEPAGE / MADV_COLLAPSE). > > > > This is confusing you're mixing VMA flags with MADV ones... maybe just > > stick to madvise ones, or add extra context around VM_HUGEPAGE bit? > > I don't see anything confusing here, really. > > But if it helps you, we can do > (e.g., MADV_HUGEPAGE / VM_HUGEPAGE, MADV_COLLAPSE). > > (reason VM_HUGEPAGE is spelled out is that there might be code where we set > VM_HUGEPAGE implicitly in the kernel) Yeah to be clear this is a pretty minor point, and I see why we'd want to mention VM_HUGEPAGE explicitly, as it is in fact this that overrides THP being disabled for the process. > > > > > Would need to be fixed up in a prior commit obviously. > > > > > */ > > > # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) > > > #define PR_GET_THP_DISABLE 42 > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > > index 85252b468f80..ef5ccb0ec5d5 100644 > > > --- a/mm/huge_memory.c > > > +++ b/mm/huge_memory.c > > > @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, > > > { > > > const bool smaps = type == TVA_SMAPS; > > > const bool in_pf = type == TVA_PAGEFAULT; > > > - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; > > > + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; > > > + const bool enforce_sysfs = !forced_collapse; > > > > Can we just get rid of this enforce_sysfs altogether in patch 2/5 and use > > forced_collapse? > > Let's do that as a separate cleanup on top. I want least churn in that > patch. > > (had the same idea while writing that patch, but I have other things to > focus on than cleaning up all this mess) Ack, I'm fine with that. > > -- > Cheers, > > David / dhildenb > Cheers, Lorenzo
On 31/07/2025 15:54, David Hildenbrand wrote: > On 31.07.25 16:38, Lorenzo Stoakes wrote: >> Nits on subject: >> >> - It's >75 chars > > No big deal. If we cna come up with something shorter, good. Changed it to "mm/huge_memory: respect MADV_COLLAPSE with PR_THP_DISABLE_EXCEPT_ADVISED" for the next revision. That would be 73 chars :) > >> - advise is the verb, advice is the noun. > > Yeah. > >> >> On Thu, Jul 31, 2025 at 01:27:20PM +0100, Usama Arif wrote: >>> From: David Hildenbrand <david@redhat.com> >>> >>> Let's allow for making MADV_COLLAPSE succeed on areas that neither have >>> VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled >>> unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). >> >> Hmm, I'm not sure about this. >> >> So far this prctl() has been the only way to override MADV_COLLAPSE >> behaviour, but now we're allowing for this one case to not. > > This is not an override really. prctl() disallowed MADV_COLLAPSE, but in the new mode we don't want that anymore. > >> > I suppose the precedent is that MADV_COLLAPSE overrides 'madvise' sysfs >> behaviour. >> > I suppose what saves us here is 'advised' can be read to mean either >> MADV_HUGEPAGE or MADV_COLLAPSE. >> > And yes, MADV_COLLAPSE is clearly the user requesting this behaviour. > > Exactly. > >> >> I think the vagueness here is one that already existed, because one could >> perfectly one have expected MADV_COLLAPSE to obey sysfs and require >> MADV_HUGEPAGE to have been applied, but of course this is not the case. > > Yes. > >> >> OK so fine. >> >> BUT. >> >> I think the MADV_COLLAPSE man page will need to be updated to mention this. >> > > Yes. > Thanks, yes will do and send this along with changes to prctl man page after this makes into mm-stable. >> And I REALLY think we should update the THP doc too to mention all these >> prctl() modes. >> >> I'm not sure we cover that right now _at all_ and obviously we should >> describe the new flags. >> >> Usama - can you add a patch to this series to do that? > > Good point, let's document the interaction with prctl(). > I have added the following patch for the next revision. I know that a lot of this will be in the man page as well, but I have gone the way of being very very explicit of what are all the possible calls that can be made (hopefully thats the right approach :)) commit 5f290d29741a514d0861d0f99c8b860ba6af9c37 Author: Usama Arif <usamaarif642@gmail.com> Date: Fri Aug 1 12:05:49 2025 +0100 docs: transhuge: document process level THP controls This includes the PR_SET_THP_DISABLE/PR_GET_THP_DISABLE pair of prctl calls as well the newly introduced PR_THP_DISABLE_EXCEPT_ADVISED flag for the PR_SET_THP_DISABLE prctl call. Signed-off-by: Usama Arif <usamaarif642@gmail.com> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 370fba113460..cce0a99beac8 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -225,6 +225,45 @@ to "always" or "madvise"), and it'll be automatically shutdown when PMD-sized THP is disabled (when both the per-size anon control and the top-level control are "never") +process THP controls +-------------------- + +A process can control its own THP behaviour using the ``PR_SET_THP_DISABLE`` +and ``PR_GET_THP_DISABLE`` pair of prctl(2) calls. These calls support the +following arguments:: + + prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0): + This will set the MMF_DISABLE_THP_COMPLETELY mm flag which will + result in no THPs being faulted in or collapsed, irrespective + of global THP controls. This flag and hence the behaviour is + inherited across fork(2) and execve(2). + + prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0): + This will set the MMF_DISABLE_THP_EXCEPT_ADVISED mm flag which + will result in THPs being faulted in or collapsed only for + the following cases: + - Global THP controls are set to "always" or "madvise" and + the process has madvised the region with either MADV_HUGEPAGE + or MADV_COLLAPSE. + - Global THP controls is set to "never" and the process has + madvised the region with MADV_COLLAPSE. + This flag and hence the behaviour is inherited across fork(2) + and execve(2). + + prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0): + This will clear the MMF_DISABLE_THP_COMPLETELY and + MMF_DISABLE_THP_EXCEPT_ADVISED mm flags. The process will + behave according to the global THP controls. This behaviour + will be inherited across fork(2) and execve(2). + + prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0): + This will return the THP disable mm flag status of the process + that was set by prctl(PR_SET_THP_DISABLE, ...). + i.e. + - 1 if MMF_DISABLE_THP_COMPLETELY flag is set + - 3 if MMF_DISABLE_THP_EXCEPT_ADVISED flag is set + - 0 otherwise. + Khugepaged controls ------------------- >> >>> >>> MADV_COLLAPSE is a clear advise that we want to collapse. >> >> advise -> advice. >> >>> >>> Note that we still respect the VM_NOHUGEPAGE flag, just like >>> MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only >>> refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED. >> >> You also need to mention the shmem change you've made I think. > > Yes. > >> >> >>> Co-developed-by: Usama Arif <usamaarif642@gmail.com> >>> Signed-off-by: Usama Arif <usamaarif642@gmail.com> >>> Signed-off-by: David Hildenbrand <david@redhat.com> >>> --- >>> include/linux/huge_mm.h | 8 +++++++- >>> include/uapi/linux/prctl.h | 2 +- >>> mm/huge_memory.c | 5 +++-- >>> mm/memory.c | 6 ++++-- >>> mm/shmem.c | 2 +- >>> 5 files changed, 16 insertions(+), 7 deletions(-) >>> >>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h >>> index b0ff54eee81c..aeaf93f8ac2e 100644 >>> --- a/include/linux/huge_mm.h >>> +++ b/include/linux/huge_mm.h >>> @@ -329,7 +329,7 @@ struct thpsize { >>> * through madvise or prctl. >>> */ >>> static inline bool vma_thp_disabled(struct vm_area_struct *vma, >>> - vm_flags_t vm_flags) >>> + vm_flags_t vm_flags, bool forced_collapse) >>> { >>> /* Are THPs disabled for this VMA? */ >>> if (vm_flags & VM_NOHUGEPAGE) >>> @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, >>> */ >>> if (vm_flags & VM_HUGEPAGE) >>> return false; >>> + /* >>> + * Forcing a collapse (e.g., madv_collapse), is a clear advise to >> >> advise -> advice. >> >>> + * use THPs. >>> + */ >>> + if (forced_collapse) >>> + return false; >>> return test_bit(MMF_DISABLE_THP_EXCEPT_ADVISED, &vma->vm_mm->flags); >>> } >>> >>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h >>> index 9c1d6e49b8a9..ee4165738779 100644 >>> --- a/include/uapi/linux/prctl.h >>> +++ b/include/uapi/linux/prctl.h >>> @@ -185,7 +185,7 @@ struct prctl_mm_map { >>> #define PR_SET_THP_DISABLE 41 >>> /* >>> * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / >>> - * VM_HUGEPAGE). >>> + * VM_HUGEPAGE / MADV_COLLAPSE). >> >> This is confusing you're mixing VMA flags with MADV ones... maybe just >> stick to madvise ones, or add extra context around VM_HUGEPAGE bit? > > I don't see anything confusing here, really. > > But if it helps you, we can do > (e.g., MADV_HUGEPAGE / VM_HUGEPAGE, MADV_COLLAPSE). > > (reason VM_HUGEPAGE is spelled out is that there might be code where we set VM_HUGEPAGE implicitly in the kernel) > >> >> Would need to be fixed up in a prior commit obviously. >> >>> */ >>> # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) >>> #define PR_GET_THP_DISABLE 42 >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index 85252b468f80..ef5ccb0ec5d5 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, >>> { >>> const bool smaps = type == TVA_SMAPS; >>> const bool in_pf = type == TVA_PAGEFAULT; >>> - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; >>> + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; >>> + const bool enforce_sysfs = !forced_collapse; >> >> Can we just get rid of this enforce_sysfs altogether in patch 2/5 and use >> forced_collapse? > > Let's do that as a separate cleanup on top. I want least churn in that patch. > > (had the same idea while writing that patch, but I have other things to focus on than cleaning up all this mess) > I am happy to send this cleanup once this series makes it to mm-new and a new revision is not expected. Thanks! Usama
© 2016 - 2025 Red Hat, Inc.