The ceiling and tree search limit need to be different arguments for the
future change in the failed fork attempt.
Add some documentation around free_pgtables() and the limits in an
attempt to clarify the floor and ceiling use as well as the new
tree_max.
Test code also updated.
No functional changes intended.
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
---
mm/internal.h | 4 +++-
mm/memory.c | 28 +++++++++++++++++++++++++---
mm/mmap.c | 2 +-
mm/vma.c | 3 ++-
tools/testing/vma/vma_internal.h | 3 ++-
5 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 63e3ec8d63be7..d295252407fee 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *start_vma, unsigned long floor,
- unsigned long ceiling, bool mm_wr_locked);
+ unsigned long ceiling, unsigned long tree_max,
+ bool mm_wr_locked);
+
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
struct zap_details;
diff --git a/mm/memory.c b/mm/memory.c
index 3e0404bd57a02..24716b3713f66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -369,12 +369,34 @@ void free_pgd_range(struct mmu_gather *tlb,
} while (pgd++, addr = next, addr != end);
}
+/*
+ * free_pgtables() - Free a range of page tables
+ * @tlb: The mmu gather
+ * @mas: The maple state
+ * @vma: The first vma
+ * @floor: The lowest page table address
+ * @ceiling: The highest page table address
+ * @tree_max: The highest tree search address
+ * @mm_wr_locked: boolean indicating if the mm is write locked
+ *
+ * Note: Floor and ceiling are provided to indicate the absolute range of the
+ * page tables that should be removed. This can differ from the vma mappings on
+ * some archs that may have mappings that need to be removed outside the vmas.
+ * Note that the prev->vm_end and next->vm_start are often used.
+ *
+ * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
+ * has unrelated data to the mm_struct being torn down.
+ */
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long floor,
- unsigned long ceiling, bool mm_wr_locked)
+ unsigned long ceiling, unsigned long tree_max,
+ bool mm_wr_locked)
{
struct unlink_vma_file_batch vb;
+ /* underflow can happen and is fine */
+ WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
+
tlb_free_vmas(tlb);
do {
@@ -385,7 +407,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
* Note: USER_PGTABLES_CEILING may be passed as ceiling and may
* be 0. This will underflow and is okay.
*/
- next = mas_find(mas, ceiling - 1);
+ next = mas_find(mas, tree_max - 1);
if (unlikely(xa_is_zero(next)))
next = NULL;
@@ -405,7 +427,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
vma = next;
- next = mas_find(mas, ceiling - 1);
+ next = mas_find(mas, tree_max - 1);
if (unlikely(xa_is_zero(next)))
next = NULL;
if (mm_wr_locked)
diff --git a/mm/mmap.c b/mm/mmap.c
index a290448a53bb2..0f4808f135fe6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
mt_clear_in_rcu(&mm->mm_mt);
vma_iter_set(&vmi, vma->vm_end);
free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
- USER_PGTABLES_CEILING, true);
+ USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);
/*
diff --git a/mm/vma.c b/mm/vma.c
index a648e0555c873..1bae142bbc0f1 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -486,6 +486,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
/* mm_wr_locked = */ true);
mas_set(mas, vma->vm_end);
free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : USER_PGTABLES_CEILING,
next ? next->vm_start : USER_PGTABLES_CEILING,
/* mm_wr_locked = */ true);
tlb_finish_mmu(&tlb);
@@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
mas_set(mas_detach, 1);
/* start and end may be different if there is no prev or next vma. */
free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
- vms->unmap_end, mm_wr_locked);
+ vms->unmap_end, vms->unmap_end, mm_wr_locked);
tlb_finish_mmu(&tlb);
vms->clear_ptes = false;
}
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 07167446dcf42..823d379e1fac2 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -900,7 +900,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
struct vm_area_struct *vma, unsigned long floor,
- unsigned long ceiling, bool mm_wr_locked)
+ unsigned long ceiling, unsigned long tree_max,
+ bool mm_wr_locked)
{
(void)tlb;
(void)mas;
--
2.47.2
> +/*
> + * free_pgtables() - Free a range of page tables
> + * @tlb: The mmu gather
> + * @mas: The maple state
> + * @vma: The first vma
> + * @floor: The lowest page table address
> + * @ceiling: The highest page table address
> + * @tree_max: The highest tree search address
> + * @mm_wr_locked: boolean indicating if the mm is write locked

I'm sure there is a good reason why we don't call this stuff here

floor -> start
ceiling -> end
tree_max -> tree_end

--
Cheers

David / dhildenb
* David Hildenbrand <david@redhat.com> [250911 05:24]:
> > +/*
> > + * free_pgtables() - Free a range of page tables
> > + * @tlb: The mmu gather
> > + * @mas: The maple state
> > + * @vma: The first vma
> > + * @floor: The lowest page table address
> > + * @ceiling: The highest page table address
> > + * @tree_max: The highest tree search address
> > + * @mm_wr_locked: boolean indicating if the mm is write locked
>
> I'm sure there is a good reason why we don't call this stuff here
>
> floor -> start
> ceiling -> end
> tree_max -> tree_end

It may not be the tree end.

The floor and ceiling names were here before I arrived.  I think the
floor and ceiling comes from the arch definition part:
USER_PGTABLES_CEILING, for example.
On Tue, Sep 09, 2025 at 03:09:40PM -0400, Liam R. Howlett wrote:
> The ceiling and tree search limit need to be different arguments for the
> future change in the failed fork attempt.
>
> Add some documentation around free_pgtables() and the limits in an
> attempt to clarify the floor and ceiling use as well as the new
> tree_max.
>
> Test code also updated.
>
> No functional changes intended.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
LGTM other than the nits below, so with those addressed feel free to add:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/internal.h | 4 +++-
> mm/memory.c | 28 +++++++++++++++++++++++++---
> mm/mmap.c | 2 +-
> mm/vma.c | 3 ++-
> tools/testing/vma/vma_internal.h | 3 ++-
> 5 files changed, 33 insertions(+), 7 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 63e3ec8d63be7..d295252407fee 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
>
> void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *start_vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked);
> + unsigned long ceiling, unsigned long tree_max,
> + bool mm_wr_locked);
> +
> void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
>
> struct zap_details;
> diff --git a/mm/memory.c b/mm/memory.c
> index 3e0404bd57a02..24716b3713f66 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -369,12 +369,34 @@ void free_pgd_range(struct mmu_gather *tlb,
> } while (pgd++, addr = next, addr != end);
> }
>
> +/*
NIT: /** right? This looks like a kernel-doc to me!
> + * free_pgtables() - Free a range of page tables
> + * @tlb: The mmu gather
> + * @mas: The maple state
> + * @vma: The first vma
> + * @floor: The lowest page table address
> + * @ceiling: The highest page table address
> + * @tree_max: The highest tree search address
> + * @mm_wr_locked: boolean indicating if the mm is write locked
> + *
> + * Note: Floor and ceiling are provided to indicate the absolute range of the
> + * page tables that should be removed. This can differ from the vma mappings on
> + * some archs that may have mappings that need to be removed outside the vmas.
> + * Note that the prev->vm_end and next->vm_start are often used.
Great write-up though you are missing some horrified noises re: the arches doing
that, I guess the reader has to play them back in their head... ;)
> + *
> + * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
> + * has unrelated data to the mm_struct being torn down.
> + */
Ohhh nice nice thanks for adding this comment!
> void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked)
> + unsigned long ceiling, unsigned long tree_max,
> + bool mm_wr_locked)
> {
> struct unlink_vma_file_batch vb;
>
> + /* underflow can happen and is fine */
I am taking this as a sign that underflow is in fact fine _everywhere_
including in internal plumbing! :P
> + WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
Hmm, so if tree_max == 1, and ceilling == 0, we're ok with that (would
resolve to tree_max = 0 and ceiling == ULONG_MAX), I guess relating to the
'interpret 0 as everything' semantics I think we have for ceiling?
I guess it's because these are exclusive.
So perhaps worth updating comment to:
/*
* these values are exclusive bounds, with 0 being interpreted as the
* entire range, so underflow is fine.
*/
or similar, just to really underline that...
> +
> tlb_free_vmas(tlb);
>
> do {
> @@ -385,7 +407,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
> * be 0. This will underflow and is okay.
> */
> - next = mas_find(mas, ceiling - 1);
> + next = mas_find(mas, tree_max - 1);
> if (unlikely(xa_is_zero(next)))
> next = NULL;
>
> @@ -405,7 +427,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> */
> while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
> vma = next;
> - next = mas_find(mas, ceiling - 1);
> + next = mas_find(mas, tree_max - 1);
> if (unlikely(xa_is_zero(next)))
> next = NULL;
> if (mm_wr_locked)
> diff --git a/mm/mmap.c b/mm/mmap.c
> index a290448a53bb2..0f4808f135fe6 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
> mt_clear_in_rcu(&mm->mm_mt);
> vma_iter_set(&vmi, vma->vm_end);
> free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> - USER_PGTABLES_CEILING, true);
> + USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
NIT: Might be nice, while we're here, to add your (very nice) convention of
prefacing boolean params with the param name, so here:
..., /* mm_wr_locked= */ true);
> tlb_finish_mmu(&tlb);
>
> /*
> diff --git a/mm/vma.c b/mm/vma.c
> index a648e0555c873..1bae142bbc0f1 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -486,6 +486,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
> /* mm_wr_locked = */ true);
> mas_set(mas, vma->vm_end);
> free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> + next ? next->vm_start : USER_PGTABLES_CEILING,
> next ? next->vm_start : USER_PGTABLES_CEILING,
> /* mm_wr_locked = */ true);
> tlb_finish_mmu(&tlb);
> @@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> mas_set(mas_detach, 1);
> /* start and end may be different if there is no prev or next vma. */
> free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> - vms->unmap_end, mm_wr_locked);
> + vms->unmap_end, vms->unmap_end, mm_wr_locked);
> tlb_finish_mmu(&tlb);
> vms->clear_ptes = false;
> }
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 07167446dcf42..823d379e1fac2 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -900,7 +900,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
>
> static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked)
> + unsigned long ceiling, unsigned long tree_max,
> + bool mm_wr_locked)
> {
> (void)tlb;
> (void)mas;
> --
> 2.47.2
>
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250911 05:08]:
> On Tue, Sep 09, 2025 at 03:09:40PM -0400, Liam R. Howlett wrote:
> > The ceiling and tree search limit need to be different arguments for the
> > future change in the failed fork attempt.
> >
> > Add some documentation around free_pgtables() and the limits in an
> > attempt to clarify the floor and ceiling use as well as the new
> > tree_max.
> >
> > Test code also updated.
> >
> > No functional changes intended.
> >
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>
> LGTM other than the nits below, so with those addressed feel free to add:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> > ---
> > mm/internal.h | 4 +++-
> > mm/memory.c | 28 +++++++++++++++++++++++++---
> > mm/mmap.c | 2 +-
> > mm/vma.c | 3 ++-
> > tools/testing/vma/vma_internal.h | 3 ++-
> > 5 files changed, 33 insertions(+), 7 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 63e3ec8d63be7..d295252407fee 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
> >
> > void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > struct vm_area_struct *start_vma, unsigned long floor,
> > - unsigned long ceiling, bool mm_wr_locked);
> > + unsigned long ceiling, unsigned long tree_max,
> > + bool mm_wr_locked);
> > +
> > void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
> >
> > struct zap_details;
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 3e0404bd57a02..24716b3713f66 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -369,12 +369,34 @@ void free_pgd_range(struct mmu_gather *tlb,
> > } while (pgd++, addr = next, addr != end);
> > }
> >
> > +/*
>
> NIT: /** right? This looks like a kernel-doc to me!
>
> > + * free_pgtables() - Free a range of page tables
> > + * @tlb: The mmu gather
> > + * @mas: The maple state
> > + * @vma: The first vma
> > + * @floor: The lowest page table address
> > + * @ceiling: The highest page table address
> > + * @tree_max: The highest tree search address
> > + * @mm_wr_locked: boolean indicating if the mm is write locked
> > + *
> > + * Note: Floor and ceiling are provided to indicate the absolute range of the
> > + * page tables that should be removed. This can differ from the vma mappings on
> > + * some archs that may have mappings that need to be removed outside the vmas.
> > + * Note that the prev->vm_end and next->vm_start are often used.
>
> Great write-up though you are missing some horrified noises re: the arches doing
> that, I guess the reader has to play them back in their head... ;)
>
> > + *
> > + * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
> > + * has unrelated data to the mm_struct being torn down.
> > + */
>
> Ohhh nice nice thanks for adding this comment!
>
> > void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > struct vm_area_struct *vma, unsigned long floor,
> > - unsigned long ceiling, bool mm_wr_locked)
> > + unsigned long ceiling, unsigned long tree_max,
> > + bool mm_wr_locked)
> > {
> > struct unlink_vma_file_batch vb;
> >
> > + /* underflow can happen and is fine */
>
> I am taking this as a sign that underflow is in fact fine _everywhere_
> including in internal plumbing! :P
>
> > + WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
>
> Hmm, so if tree_max == 1, and ceilling == 0, we're ok with that (would
> resolve to tree_max = 0 and ceiling == ULONG_MAX), I guess relating to the
> 'interpret 0 as everything' semantics I think we have for ceiling?
>
> I guess it's because these are exclusive.
>
> So perhaps worth updating comment to:
>
> /*
> * these values are exclusive bounds, with 0 being interpreted as the
> * entire range, so underflow is fine.
> */
>
> or similar, just to really underline that...
The tree_max = 0 makes no sense and will be evaluated as a noop in the
code - we will fail to find any vmas to iterate over. We cannot have a
vma starting at 0 and running to 0.
tree_max is not an exclusive bound before this function - which uses the
maple state (mas_find) vs the vma iterator, which does the translation
for us.
ceiling has always had the potential of being 0 from the #define of
USER_PGTABLES_CEILING.
If we were to use the vma iterator, then we're having an inclusive limit
and the start/end (named floor and ceiling here..) difference. So I'm
just trying to keep everything here on the same level of reference. We
need to subtract from both values (especially since they were the same
variable before). This seems to make code easier to read, especially in
this diff.
>
> > +
> > tlb_free_vmas(tlb);
> >
> > do {
> > @@ -385,7 +407,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
> > * be 0. This will underflow and is okay.
> > */
> > - next = mas_find(mas, ceiling - 1);
> > + next = mas_find(mas, tree_max - 1);
> > if (unlikely(xa_is_zero(next)))
> > next = NULL;
> >
> > @@ -405,7 +427,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > */
> > while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
> > vma = next;
> > - next = mas_find(mas, ceiling - 1);
> > + next = mas_find(mas, tree_max - 1);
> > if (unlikely(xa_is_zero(next)))
> > next = NULL;
> > if (mm_wr_locked)
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index a290448a53bb2..0f4808f135fe6 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
> > mt_clear_in_rcu(&mm->mm_mt);
> > vma_iter_set(&vmi, vma->vm_end);
> > free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> > - USER_PGTABLES_CEILING, true);
> > + USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
>
> NIT: Might be nice, while we're here, to add your (very nice) convention of
> prefacing boolean params with the param name, so here:
>
> ..., /* mm_wr_locked= */ true);
Ah... long line and I kill it later, so I left it off.
>
> > tlb_finish_mmu(&tlb);
> >
> > /*
> > diff --git a/mm/vma.c b/mm/vma.c
> > index a648e0555c873..1bae142bbc0f1 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -486,6 +486,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
> > /* mm_wr_locked = */ true);
> > mas_set(mas, vma->vm_end);
> > free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> > + next ? next->vm_start : USER_PGTABLES_CEILING,
> > next ? next->vm_start : USER_PGTABLES_CEILING,
> > /* mm_wr_locked = */ true);
> > tlb_finish_mmu(&tlb);
> > @@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> > mas_set(mas_detach, 1);
> > /* start and end may be different if there is no prev or next vma. */
> > free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> > - vms->unmap_end, mm_wr_locked);
> > + vms->unmap_end, vms->unmap_end, mm_wr_locked);
> > tlb_finish_mmu(&tlb);
> > vms->clear_ptes = false;
> > }
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 07167446dcf42..823d379e1fac2 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -900,7 +900,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> >
> > static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > struct vm_area_struct *vma, unsigned long floor,
> > - unsigned long ceiling, bool mm_wr_locked)
> > + unsigned long ceiling, unsigned long tree_max,
> > + bool mm_wr_locked)
> > {
> > (void)tlb;
> > (void)mas;
> > --
> > 2.47.2
> >
On Tue, Sep 09, 2025 at 03:09:40PM -0400, Liam R. Howlett wrote:
> The ceiling and tree search limit need to be different arguments for the
> future change in the failed fork attempt.
>
> Add some documentation around free_pgtables() and the limits in an
> attempt to clarify the floor and ceiling use as well as the new
> tree_max.
>
> Test code also updated.
>
> No functional changes intended.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>

Reviewed-by: Pedro Falcato <pfalcato@suse.de>

--
Pedro
On Tue, Sep 9, 2025 at 12:10 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> The ceiling and tree search limit need to be different arguments for the
> future change in the failed fork attempt.
>
> Add some documentation around free_pgtables() and the limits in an
> attempt to clarify the floor and ceiling use as well as the new
> tree_max.
>
> Test code also updated.
>
> No functional changes intended.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> ---
> mm/internal.h | 4 +++-
> mm/memory.c | 28 +++++++++++++++++++++++++---
> mm/mmap.c | 2 +-
> mm/vma.c | 3 ++-
> tools/testing/vma/vma_internal.h | 3 ++-
> 5 files changed, 33 insertions(+), 7 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 63e3ec8d63be7..d295252407fee 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
>
> void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *start_vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked);
> + unsigned long ceiling, unsigned long tree_max,
> + bool mm_wr_locked);
> +
> void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
>
> struct zap_details;
> diff --git a/mm/memory.c b/mm/memory.c
> index 3e0404bd57a02..24716b3713f66 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -369,12 +369,34 @@ void free_pgd_range(struct mmu_gather *tlb,
> } while (pgd++, addr = next, addr != end);
> }
>
> +/*
> + * free_pgtables() - Free a range of page tables
> + * @tlb: The mmu gather
> + * @mas: The maple state
> + * @vma: The first vma
> + * @floor: The lowest page table address
> + * @ceiling: The highest page table address
> + * @tree_max: The highest tree search address
> + * @mm_wr_locked: boolean indicating if the mm is write locked
> + *
> + * Note: Floor and ceiling are provided to indicate the absolute range of the
> + * page tables that should be removed. This can differ from the vma mappings on
> + * some archs that may have mappings that need to be removed outside the vmas.
> + * Note that the prev->vm_end and next->vm_start are often used.
> + *
> + * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
> + * has unrelated data to the mm_struct being torn down.
> + */
> void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked)
> + unsigned long ceiling, unsigned long tree_max,
> + bool mm_wr_locked)
> {
> struct unlink_vma_file_batch vb;
>
> + /* underflow can happen and is fine */
This comment is a bit confusing... I think the below check covers 2 cases:
1. if ceiling == 0 then tree_max can be anything;
2. if ceiling > 0 then tree_max <= ceiling;
Is that what you intended? If so, maybe amend the comments for the
free_pgtables() function explaining this more explicitly?
> + WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
I would prefer WARN_ON_ONCE(ceiling && tree_max > ceiling); as more
descriptive but that might be just me.
> +
> tlb_free_vmas(tlb);
>
> do {
> @@ -385,7 +407,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
You replaced ceiling with tree_max, so the above comment needs to be
updated as well.
> * be 0. This will underflow and is okay.
> */
> - next = mas_find(mas, ceiling - 1);
> + next = mas_find(mas, tree_max - 1);
> if (unlikely(xa_is_zero(next)))
> next = NULL;
>
> @@ -405,7 +427,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> */
> while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
> vma = next;
> - next = mas_find(mas, ceiling - 1);
> + next = mas_find(mas, tree_max - 1);
> if (unlikely(xa_is_zero(next)))
> next = NULL;
> if (mm_wr_locked)
> diff --git a/mm/mmap.c b/mm/mmap.c
> index a290448a53bb2..0f4808f135fe6 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
> mt_clear_in_rcu(&mm->mm_mt);
> vma_iter_set(&vmi, vma->vm_end);
> free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> - USER_PGTABLES_CEILING, true);
> + USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
> tlb_finish_mmu(&tlb);
>
> /*
> diff --git a/mm/vma.c b/mm/vma.c
> index a648e0555c873..1bae142bbc0f1 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -486,6 +486,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
> /* mm_wr_locked = */ true);
> mas_set(mas, vma->vm_end);
> free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> + next ? next->vm_start : USER_PGTABLES_CEILING,
> next ? next->vm_start : USER_PGTABLES_CEILING,
> /* mm_wr_locked = */ true);
> tlb_finish_mmu(&tlb);
> @@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> mas_set(mas_detach, 1);
> /* start and end may be different if there is no prev or next vma. */
> free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> - vms->unmap_end, mm_wr_locked);
> + vms->unmap_end, vms->unmap_end, mm_wr_locked);
> tlb_finish_mmu(&tlb);
> vms->clear_ptes = false;
> }
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 07167446dcf42..823d379e1fac2 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -900,7 +900,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
>
> static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> struct vm_area_struct *vma, unsigned long floor,
> - unsigned long ceiling, bool mm_wr_locked)
> + unsigned long ceiling, unsigned long tree_max,
> + bool mm_wr_locked)
> {
> (void)tlb;
> (void)mas;
> --
> 2.47.2
>
* Suren Baghdasaryan <surenb@google.com> [250909 17:06]:
> On Tue, Sep 9, 2025 at 12:10 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > The ceiling and tree search limit need to be different arguments for the
> > future change in the failed fork attempt.
> >
> > Add some documentation around free_pgtables() and the limits in an
> > attempt to clarify the floor and ceiling use as well as the new
> > tree_max.
> >
> > Test code also updated.
> >
> > No functional changes intended.
> >
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> > ---
> > mm/internal.h | 4 +++-
> > mm/memory.c | 28 +++++++++++++++++++++++++---
> > mm/mmap.c | 2 +-
> > mm/vma.c | 3 ++-
> > tools/testing/vma/vma_internal.h | 3 ++-
> > 5 files changed, 33 insertions(+), 7 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 63e3ec8d63be7..d295252407fee 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -444,7 +444,9 @@ void folio_activate(struct folio *folio);
> >
> > void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > struct vm_area_struct *start_vma, unsigned long floor,
> > - unsigned long ceiling, bool mm_wr_locked);
> > + unsigned long ceiling, unsigned long tree_max,
> > + bool mm_wr_locked);
> > +
> > void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
> >
> > struct zap_details;
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 3e0404bd57a02..24716b3713f66 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -369,12 +369,34 @@ void free_pgd_range(struct mmu_gather *tlb,
> > } while (pgd++, addr = next, addr != end);
> > }
> >
> > +/*
> > + * free_pgtables() - Free a range of page tables
> > + * @tlb: The mmu gather
> > + * @mas: The maple state
> > + * @vma: The first vma
> > + * @floor: The lowest page table address
> > + * @ceiling: The highest page table address
> > + * @tree_max: The highest tree search address
> > + * @mm_wr_locked: boolean indicating if the mm is write locked
> > + *
> > + * Note: Floor and ceiling are provided to indicate the absolute range of the
> > + * page tables that should be removed. This can differ from the vma mappings on
> > + * some archs that may have mappings that need to be removed outside the vmas.
> > + * Note that the prev->vm_end and next->vm_start are often used.
> > + *
> > + * The tree_max differs from the ceiling when a dup_mmap() failed and the tree
> > + * has unrelated data to the mm_struct being torn down.
> > + */
> > void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > struct vm_area_struct *vma, unsigned long floor,
> > - unsigned long ceiling, bool mm_wr_locked)
> > + unsigned long ceiling, unsigned long tree_max,
> > + bool mm_wr_locked)
> > {
> > struct unlink_vma_file_batch vb;
> >
> > + /* underflow can happen and is fine */
>
> This comment is a bit confusing... I think the below check covers 2 cases:
> 1. if ceiling == 0 then tree_max can be anything;
> 2. if ceiling > 0 then tree_max <= ceiling;
> Is that what you intended? If so, maybe amend the comments for the
> free_pgtables() function explaining this more explicitly?
I intended for ceiling to always be larger than or equal to tree_max,
after underflow.  That does hold true for what you wrote above.
>
> > + WARN_ON_ONCE(tree_max - 1 > ceiling - 1);
>
> I would prefer WARN_ON_ONCE(ceiling && tree_max > ceiling); as more
> descriptive but that might be just me.
Interesting. I wrote it this way to show that the underflow is possible
and since we use tree_max - 1 below, so I thought it more descriptive to
write it as is (and the comment says it's okay to underflow so don't
"fix" that issue).
>
> > +
> > tlb_free_vmas(tlb);
> >
> > do {
> > @@ -385,7 +407,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
>
> You replaced ceiling with tree_max, so the above comment needs to be
> updated as well.
>
Yes, thanks.
> > * be 0. This will underflow and is okay.
> > */
> > - next = mas_find(mas, ceiling - 1);
> > + next = mas_find(mas, tree_max - 1);
> > if (unlikely(xa_is_zero(next)))
> > next = NULL;
> >
> > @@ -405,7 +427,7 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > */
> > while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
> > vma = next;
> > - next = mas_find(mas, ceiling - 1);
> > + next = mas_find(mas, tree_max - 1);
> > if (unlikely(xa_is_zero(next)))
> > next = NULL;
> > if (mm_wr_locked)
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index a290448a53bb2..0f4808f135fe6 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1311,7 +1311,7 @@ void exit_mmap(struct mm_struct *mm)
> > mt_clear_in_rcu(&mm->mm_mt);
> > vma_iter_set(&vmi, vma->vm_end);
> > free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> > - USER_PGTABLES_CEILING, true);
> > + USER_PGTABLES_CEILING, USER_PGTABLES_CEILING, true);
> > tlb_finish_mmu(&tlb);
> >
> > /*
> > diff --git a/mm/vma.c b/mm/vma.c
> > index a648e0555c873..1bae142bbc0f1 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -486,6 +486,7 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
> > /* mm_wr_locked = */ true);
> > mas_set(mas, vma->vm_end);
> > free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
> > + next ? next->vm_start : USER_PGTABLES_CEILING,
> > next ? next->vm_start : USER_PGTABLES_CEILING,
> > /* mm_wr_locked = */ true);
> > tlb_finish_mmu(&tlb);
> > @@ -1232,7 +1233,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> > mas_set(mas_detach, 1);
> > /* start and end may be different if there is no prev or next vma. */
> > free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> > - vms->unmap_end, mm_wr_locked);
> > + vms->unmap_end, vms->unmap_end, mm_wr_locked);
> > tlb_finish_mmu(&tlb);
> > vms->clear_ptes = false;
> > }
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 07167446dcf42..823d379e1fac2 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -900,7 +900,8 @@ static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> >
> > static inline void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > struct vm_area_struct *vma, unsigned long floor,
> > - unsigned long ceiling, bool mm_wr_locked)
> > + unsigned long ceiling, unsigned long tree_max,
> > + bool mm_wr_locked)
> > {
> > (void)tlb;
> > (void)mas;
> > --
> > 2.47.2
> >
© 2016 - 2026 Red Hat, Inc.