[RFC PATCH 6/6] mm: Change dup_mmap() recovery

Posted by Liam R. Howlett 1 month, 2 weeks ago
When dup_mmap() fails during vma duplication or setup, don't
write the XA_ZERO entry in the vma tree.  Instead, destroy the tree and
free the new resources, leaving an empty vma tree.

Using XA_ZERO introduced races where the vma could be found between
dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
race can occur because the mm can be reached through the other trees
via successfully copied vmas and other methods such as the swapoff code.

XA_ZERO marked the location where vma removal and pagetable freeing
should stop.  The newly added arguments to unmap_vmas() and
free_pgtables() now serve this function.

Replacing the use of the XA_ZERO entry with the new argument list
also means the xa_is_zero() checks are no longer necessary, so they
are also removed.
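
In outline, the dup_mmap() error path becomes the following (condensed
from the mm/mmap.c hunk below):

	if (!mm->map_count)
		max = 0;		/* zero vmas were written to the new tree */
	else if (mpnt)
		max = mpnt->vm_start;	/* partial failure at mpnt */
	else
		max = ULONG_MAX;	/* all vmas were written to the new tree */

	/* Hide mm from the oom killer because the memory is being freed */
	set_bit(MMF_OOM_SKIP, &mm->flags);
	if (max) {
		vma_iter_set(&vmi, 0);
		tmp = vma_next(&vmi);
		flush_cache_mm(mm);
		unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);
		charge = tear_down_vmas(mm, &vmi, tmp, max);
		vm_unacct_memory(charge);
	}
	__mt_destroy(&mm->mm_mt);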

Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
---
 mm/memory.c |  6 +-----
 mm/mmap.c   | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 3346514562bba..8cd58f171bae4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -387,8 +387,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * be 0.  This will underflow and is okay.
 		 */
 		next = mas_find(mas, tree_max - 1);
-		if (unlikely(xa_is_zero(next)))
-			next = NULL;
 
 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -407,8 +405,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
 			next = mas_find(mas, tree_max - 1);
-			if (unlikely(xa_is_zero(next)))
-				next = NULL;
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
@@ -1978,7 +1974,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 				 mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
 		vma = mas_find(mas, tree_end - 1);
-	} while (vma && likely(!xa_is_zero(vma)));
+	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index eba2bc81bc749..5acc0b5f8c14a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);
 
 	vma = vma_next(&vmi);
-	if (!vma || unlikely(xa_is_zero(vma))) {
+	if (!vma) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
 		mmap_write_lock(mm);
@@ -1858,20 +1858,37 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		ksm_fork(mm, oldmm);
 		khugepaged_fork(mm, oldmm);
 	} else {
+		unsigned long max;
 
 		/*
-		 * The entire maple tree has already been duplicated. If the
-		 * mmap duplication fails, mark the failure point with
-		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-		 * stop releasing VMAs that have not been duplicated after this
-		 * point.
+		 * The entire maple tree has already been duplicated, but
+		 * replacing the vmas failed at mpnt (which could be NULL if
+		 * all were allocated but the last vma was not fully set up. Use
+		 * the start address of the failure point to clean up the half
+		 * initialized tree.
 		 */
-		if (mpnt) {
-			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-			mas_store(&vmi.mas, XA_ZERO_ENTRY);
-			/* Avoid OOM iterating a broken tree */
-			set_bit(MMF_OOM_SKIP, &mm->flags);
+		if (!mm->map_count) {
+			/* zero vmas were written to the new tree. */
+			max = 0;
+		} else if (mpnt) {
+			/* mid-tree failure */
+			max = mpnt->vm_start;
+		} else {
+			/* All vmas were written to the new tree */
+			max = ULONG_MAX;
 		}
+
+		/* Hide mm from oom killer because the memory is being freed */
+		set_bit(MMF_OOM_SKIP, &mm->flags);
+		if (max) {
+			vma_iter_set(&vmi, 0);
+			tmp = vma_next(&vmi);
+			flush_cache_mm(mm);
+			unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);
+			charge = tear_down_vmas(mm, &vmi, tmp, max);
+			vm_unacct_memory(charge);
+		}
+		__mt_destroy(&mm->mm_mt);
 		/*
 		 * The mm_struct is going to exit, but the locks will be dropped
 		 * first.  Set the mm_struct as unstable is advisable as it is
-- 
2.47.2
Re: [RFC PATCH 6/6] mm: Change dup_mmap() recovery
Posted by Lorenzo Stoakes 1 month, 2 weeks ago
On Fri, Aug 15, 2025 at 03:10:31PM -0400, Liam R. Howlett wrote:
> When dup_mmap() fails during vma duplication or setup, don't
> write the XA_ZERO entry in the vma tree.  Instead, destroy the tree and
> free the new resources, leaving an empty vma tree.

Yesss like it!

>
> Using XA_ZERO introduced races where the vma could be found between
> dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
> race can occur because the mm can be reached through the other trees
> via successfully copied vmas and other methods such as the swapoff code.

Yeah god.

>
> XA_ZERO marked the location where vma removal and pagetable freeing
> should stop.  The newly added arguments to unmap_vmas() and
> free_pgtables() now serve this function.

Nice.

>
> Replacing the use of the XA_ZERO entry with the new argument list
> also means the xa_is_zero() checks are no longer necessary, so they
> are also removed.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>

Ah finally the 'action' patch :)

Obviously my review is on the basis that you fixup the rebase etc.

This broadly looks right but some various nits etc. below and will have
another look through on new revision - this whole area is pretty crazy!

> ---
>  mm/memory.c |  6 +-----
>  mm/mmap.c   | 39 ++++++++++++++++++++++++++++-----------
>  2 files changed, 29 insertions(+), 16 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index 3346514562bba..8cd58f171bae4 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -387,8 +387,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  		 * be 0.  This will underflow and is okay.
>  		 */
>  		next = mas_find(mas, tree_max - 1);
> -		if (unlikely(xa_is_zero(next)))
> -			next = NULL;
>
>  		/*
>  		 * Hide vma from rmap and truncate_pagecache before freeing
> @@ -407,8 +405,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
>  			vma = next;
>  			next = mas_find(mas, tree_max - 1);
> -			if (unlikely(xa_is_zero(next)))
> -				next = NULL;
>  			if (mm_wr_locked)
>  				vma_start_write(vma);
>  			unlink_anon_vmas(vma);
> @@ -1978,7 +1974,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
>  				 mm_wr_locked);
>  		hugetlb_zap_end(vma, &details);
>  		vma = mas_find(mas, tree_end - 1);
> -	} while (vma && likely(!xa_is_zero(vma)));
> +	} while (vma);
>  	mmu_notifier_invalidate_range_end(&range);
>  }
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index eba2bc81bc749..5acc0b5f8c14a 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
>  	arch_exit_mmap(mm);
>
>  	vma = vma_next(&vmi);
> -	if (!vma || unlikely(xa_is_zero(vma))) {
> +	if (!vma) {
>  		/* Can happen if dup_mmap() received an OOM */
>  		mmap_read_unlock(mm);
>  		mmap_write_lock(mm);
> @@ -1858,20 +1858,37 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
>  		ksm_fork(mm, oldmm);
>  		khugepaged_fork(mm, oldmm);
>  	} else {
> +		unsigned long max;
>
>  		/*
> -		 * The entire maple tree has already been duplicated. If the
> -		 * mmap duplication fails, mark the failure point with
> -		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> -		 * stop releasing VMAs that have not been duplicated after this
> -		 * point.
> +		 * The entire maple tree has already been duplicated, but
> +		 * replacing the vmas failed at mpnt (which could be NULL if
> +		 * all were allocated but the last vma was not fully set up. Use

Missing ')'.

> +		 * the start address of the failure point to clean up the half
> +		 * initialized tree.

NIT: Is 'half' correct? Maybe 'partially'?

>  		 */
> -		if (mpnt) {
> -			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> -			mas_store(&vmi.mas, XA_ZERO_ENTRY);
> -			/* Avoid OOM iterating a broken tree */
> -			set_bit(MMF_OOM_SKIP, &mm->flags);
> +		if (!mm->map_count) {
> +			/* zero vmas were written to the new tree. */
> +			max = 0;

OK I guess this then will result in the intentional underflow thing, maybe
worth mentioning?

> +		} else if (mpnt) {
> +			/* mid-tree failure */

Partial?

> +			max = mpnt->vm_start;
> +		} else {
> +			/* All vmas were written to the new tree */
> +			max = ULONG_MAX;
>  		}
> +
> +		/* Hide mm from oom killer because the memory is being freed */
> +		set_bit(MMF_OOM_SKIP, &mm->flags);

Obv. update to mm_flags_set(MMF_OOM_SKIP, mm);

> +		if (max) {
> +			vma_iter_set(&vmi, 0);
> +			tmp = vma_next(&vmi);
> +			flush_cache_mm(mm);
> +			unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);

(An aside for the unmap_region() impl, maybe let's name the pg_max as
tree_max to make it consistent across both?)

Hm we could still use the mmap struct here if we put in vma.h. Just have to
set vmi, obv prev, next NULL.

So:

	struct mmap_state map {
		.vmi = &vmi,
	}

	unmap_region(&map, tmp, 0, max, max);

Possibly overkill and hopefully stack ok but makes other callers less
horrid.

Maybe also good to add a comment spelling out what each of these params do.
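
Something like this maybe (completely untested, and I'm only guessing at
what each parameter means from the call site above):

	struct mmap_state map = {
		.vmi = &vmi,	/* walk the partially populated tree */
		/* prev/next implicitly NULL - nothing else remains */
	};

	/* Guessing: tmp is the first copied vma, 0..max is the range the
	 * copied vmas occupy, and the second max is where pagetable
	 * freeing has to stop (the pg_max/tree_max thing). */
	unmap_region(&map, tmp, 0, max, max);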


> +			charge = tear_down_vmas(mm, &vmi, tmp, max);
> +			vm_unacct_memory(charge);
> +		}
> +		__mt_destroy(&mm->mm_mt);
>  		/*
>  		 * The mm_struct is going to exit, but the locks will be dropped
>  		 * first.  Set the mm_struct as unstable is advisable as it is
> --
> 2.47.2
>
Re: [RFC PATCH 6/6] mm: Change dup_mmap() recovery
Posted by Liam R. Howlett 1 month ago
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250819 16:34]:
> On Fri, Aug 15, 2025 at 03:10:31PM -0400, Liam R. Howlett wrote:
> > When dup_mmap() fails during vma duplication or setup, don't
> > write the XA_ZERO entry in the vma tree.  Instead, destroy the tree and
> > free the new resources, leaving an empty vma tree.
> 
> Yesss like it!
> 
> >
> > Using XA_ZERO introduced races where the vma could be found between
> > dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
> > race can occur because the mm can be reached through the other trees
> > via successfully copied vmas and other methods such as the swapoff code.
> 
> Yeah god.
> 
> >
> > XA_ZERO marked the location where vma removal and pagetable freeing
> > should stop.  The newly added arguments to unmap_vmas() and
> > free_pgtables() now serve this function.
> 
> Nice.
> 
> >
> > Replacing the use of the XA_ZERO entry with the new argument list
> > also means the xa_is_zero() checks are no longer necessary, so they
> > are also removed.
> >
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> 
> Ah finally the 'action' patch :)
> 
> Obviously my review is on the basis that you fixup the rebase etc.
> 
> This broadly looks right but some various nits etc. below and will have
> another look through on new revision - this whole area is pretty crazy!
> 
> > ---
> >  mm/memory.c |  6 +-----
> >  mm/mmap.c   | 39 ++++++++++++++++++++++++++++-----------
> >  2 files changed, 29 insertions(+), 16 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 3346514562bba..8cd58f171bae4 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -387,8 +387,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> >  		 * be 0.  This will underflow and is okay.
> >  		 */
> >  		next = mas_find(mas, tree_max - 1);
> > -		if (unlikely(xa_is_zero(next)))
> > -			next = NULL;
> >
> >  		/*
> >  		 * Hide vma from rmap and truncate_pagecache before freeing
> > @@ -407,8 +405,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> >  		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
> >  			vma = next;
> >  			next = mas_find(mas, tree_max - 1);
> > -			if (unlikely(xa_is_zero(next)))
> > -				next = NULL;
> >  			if (mm_wr_locked)
> >  				vma_start_write(vma);
> >  			unlink_anon_vmas(vma);
> > @@ -1978,7 +1974,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> >  				 mm_wr_locked);
> >  		hugetlb_zap_end(vma, &details);
> >  		vma = mas_find(mas, tree_end - 1);
> > -	} while (vma && likely(!xa_is_zero(vma)));
> > +	} while (vma);
> >  	mmu_notifier_invalidate_range_end(&range);
> >  }
> >
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index eba2bc81bc749..5acc0b5f8c14a 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
> >  	arch_exit_mmap(mm);
> >
> >  	vma = vma_next(&vmi);
> > -	if (!vma || unlikely(xa_is_zero(vma))) {
> > +	if (!vma) {
> >  		/* Can happen if dup_mmap() received an OOM */
> >  		mmap_read_unlock(mm);
> >  		mmap_write_lock(mm);
> > @@ -1858,20 +1858,37 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> >  		ksm_fork(mm, oldmm);
> >  		khugepaged_fork(mm, oldmm);
> >  	} else {
> > +		unsigned long max;
> >
> >  		/*
> > -		 * The entire maple tree has already been duplicated. If the
> > -		 * mmap duplication fails, mark the failure point with
> > -		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> > -		 * stop releasing VMAs that have not been duplicated after this
> > -		 * point.
> > +		 * The entire maple tree has already been duplicated, but
> > +		 * replacing the vmas failed at mpnt (which could be NULL if
> > +		 * all were allocated but the last vma was not fully set up. Use
> 
> Missing ')'.

Thanks.

> 
> > +		 * the start address of the failure point to clean up the half
> > +		 * initialized tree.
> 
> NIT: Is 'half' correct? Maybe 'partially'?

Thanks.

> 
> >  		 */
> > -		if (mpnt) {
> > -			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> > -			mas_store(&vmi.mas, XA_ZERO_ENTRY);
> > -			/* Avoid OOM iterating a broken tree */
> > -			set_bit(MMF_OOM_SKIP, &mm->flags);
> > +		if (!mm->map_count) {
> > +			/* zero vmas were written to the new tree. */
> > +			max = 0;
> 
> OK I guess this then will result in the intentional underflow thing, maybe
> worth mentioning?

No, nothing will happen as the if (max) will evaluate as false.

> 
> > +		} else if (mpnt) {
> > +			/* mid-tree failure */
> 
> Partial?

Right, thanks.

> 
> > +			max = mpnt->vm_start;
> > +		} else {
> > +			/* All vmas were written to the new tree */
> > +			max = ULONG_MAX;
> >  		}
> > +
> > +		/* Hide mm from oom killer because the memory is being freed */
> > +		set_bit(MMF_OOM_SKIP, &mm->flags);
> 
> Obv. update to mm_flags_set(MMF_OOM_SKIP, mm);

Right, happened on rebase.

> 
> > +		if (max) {
> > +			vma_iter_set(&vmi, 0);
> > +			tmp = vma_next(&vmi);
> > +			flush_cache_mm(mm);
> > +			unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);
> 
> (An aside for the unmap_region() impl, maybe let's name the pg_max as
> tree_max to make it consistent across both?)
> 
> Hm we could still use the mmap struct here if we put in vma.h. Just have to
> set vmi, obv prev, next NULL.
> 
> So:
> 
> 	struct mmap_state map {
> 		.vmi = &vmi,
> 	}
> 
> 	unmap_region(&map, tmp, 0, max, max);
> 
> Possibly overkill and hopefully stack ok but makes other callers less
> horrid.
> 
> Maybe also good to add a comment spelling out what each of these params do.

So I've tried to do this cleanly but it hasn't worked out.  I think I'll
add another struct to clean this up in another patch on this series.

Thanks,
Liam
Re: [RFC PATCH 6/6] mm: Change dup_mmap() recovery
Posted by Lorenzo Stoakes 4 weeks, 1 day ago
On Wed, Sep 03, 2025 at 08:13:45PM -0400, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250819 16:34]:
> > On Fri, Aug 15, 2025 at 03:10:31PM -0400, Liam R. Howlett wrote:
> > > When dup_mmap() fails during vma duplication or setup, don't
> > > write the XA_ZERO entry in the vma tree.  Instead, destroy the tree and
> > > free the new resources, leaving an empty vma tree.
> >
> > Yesss like it!
> >
> > >
> > > Using XA_ZERO introduced races where the vma could be found between
> > > dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
> > > race can occur because the mm can be reached through the other trees
> > > via successfully copied vmas and other methods such as the swapoff code.
> >
> > Yeah god.
> >
> > >
> > > XA_ZERO marked the location where vma removal and pagetable freeing
> > > should stop.  The newly added arguments to unmap_vmas() and
> > > free_pgtables() now serve this function.
> >
> > Nice.
> >
> > >
> > > Replacing the use of the XA_ZERO entry with the new argument list
> > > also means the xa_is_zero() checks are no longer necessary, so they
> > > are also removed.
> > >
> > > Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> >
> > Ah finally the 'action' patch :)
> >
> > Obviously my review is on the basis that you fixup the rebase etc.
> >
> > This broadly looks right but some various nits etc. below and will have
> > another look through on new revision - this whole area is pretty crazy!
> >
> > > ---
> > >  mm/memory.c |  6 +-----
> > >  mm/mmap.c   | 39 ++++++++++++++++++++++++++++-----------
> > >  2 files changed, 29 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 3346514562bba..8cd58f171bae4 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -387,8 +387,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > >  		 * be 0.  This will underflow and is okay.
> > >  		 */
> > >  		next = mas_find(mas, tree_max - 1);
> > > -		if (unlikely(xa_is_zero(next)))
> > > -			next = NULL;
> > >
> > >  		/*
> > >  		 * Hide vma from rmap and truncate_pagecache before freeing
> > > @@ -407,8 +405,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
> > >  		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
> > >  			vma = next;
> > >  			next = mas_find(mas, tree_max - 1);
> > > -			if (unlikely(xa_is_zero(next)))
> > > -				next = NULL;
> > >  			if (mm_wr_locked)
> > >  				vma_start_write(vma);
> > >  			unlink_anon_vmas(vma);
> > > @@ -1978,7 +1974,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
> > >  				 mm_wr_locked);
> > >  		hugetlb_zap_end(vma, &details);
> > >  		vma = mas_find(mas, tree_end - 1);
> > > -	} while (vma && likely(!xa_is_zero(vma)));
> > > +	} while (vma);
> > >  	mmu_notifier_invalidate_range_end(&range);
> > >  }
> > >
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index eba2bc81bc749..5acc0b5f8c14a 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
> > >  	arch_exit_mmap(mm);
> > >
> > >  	vma = vma_next(&vmi);
> > > -	if (!vma || unlikely(xa_is_zero(vma))) {
> > > +	if (!vma) {
> > >  		/* Can happen if dup_mmap() received an OOM */
> > >  		mmap_read_unlock(mm);
> > >  		mmap_write_lock(mm);
> > > @@ -1858,20 +1858,37 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
> > >  		ksm_fork(mm, oldmm);
> > >  		khugepaged_fork(mm, oldmm);
> > >  	} else {
> > > +		unsigned long max;
> > >
> > >  		/*
> > > -		 * The entire maple tree has already been duplicated. If the
> > > -		 * mmap duplication fails, mark the failure point with
> > > -		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
> > > -		 * stop releasing VMAs that have not been duplicated after this
> > > -		 * point.
> > > +		 * The entire maple tree has already been duplicated, but
> > > +		 * replacing the vmas failed at mpnt (which could be NULL if
> > > +		 * all were allocated but the last vma was not fully set up. Use
> >
> > Missing ')'.
>
> Thanks.
>
> >
> > > +		 * the start address of the failure point to clean up the half
> > > +		 * initialized tree.
> >
> > NIT: Is 'half' correct? Maybe 'partially'?
>
> Thanks.
>
> >
> > >  		 */
> > > -		if (mpnt) {
> > > -			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
> > > -			mas_store(&vmi.mas, XA_ZERO_ENTRY);
> > > -			/* Avoid OOM iterating a broken tree */
> > > -			set_bit(MMF_OOM_SKIP, &mm->flags);
> > > +		if (!mm->map_count) {
> > > +			/* zero vmas were written to the new tree. */
> > > +			max = 0;
> >
> > OK I guess this then will result in the intentional underflow thing, maybe
> > worth mentioning?
>
> No, nothing will happen as the if (max) will evaluate as false.

Ahhh ok. This makes more sense :)

>
> >
> > > +		} else if (mpnt) {
> > > +			/* mid-tree failure */
> >
> > Partial?
>
> Right, thanks.
>
> >
> > > +			max = mpnt->vm_start;
> > > +		} else {
> > > +			/* All vmas were written to the new tree */
> > > +			max = ULONG_MAX;
> > >  		}
> > > +
> > > +		/* Hide mm from oom killer because the memory is being freed */
> > > +		set_bit(MMF_OOM_SKIP, &mm->flags);
> >
> > Obv. update to mm_flags_set(MMF_OOM_SKIP, mm);
>
> Right, happened on rebase.
>
> >
> > > +		if (max) {
> > > +			vma_iter_set(&vmi, 0);
> > > +			tmp = vma_next(&vmi);
> > > +			flush_cache_mm(mm);
> > > +			unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);
> >
> > (An aside for the unmap_region() impl, maybe let's name the pg_max as
> > tree_max to make it consistent across both?)
> >
> > Hm we could still use the mmap struct here if we put in vma.h. Just have to
> > set vmi, obv prev, next NULL.
> >
> > So:
> >
> > 	struct mmap_state map {
> > 		.vmi = &vmi,
> > 	}
> >
> > 	unmap_region(&map, tmp, 0, max, max);
> >
> > Possibly overkill and hopefully stack ok but makes other callers less
> > horrid.
> >
> > Maybe also good to add a comment spelling out what each of these params do.
>
> So I've tried to do this cleanly but it hasn't worked out.  I think I'll
> add another struct to clean this up in another patch on this series.

Moar helper structs! Yes yes yes! ;)

>
> Thanks,
> Liam

Cheers, Lorenzo
Re: [RFC PATCH 6/6] mm: Change dup_mmap() recovery
Posted by Lorenzo Stoakes 1 month, 2 weeks ago
It seems your series clashes with my mm flags change, and this patch is a
victim, it won't apply against mm-new at all :(
Re: [RFC PATCH 6/6] mm: Change dup_mmap() recovery
Posted by Lorenzo Stoakes 1 month, 2 weeks ago
On Mon, Aug 18, 2025 at 04:12:59PM +0100, Lorenzo Stoakes wrote:
> It seems your series clashes with my mm flags change, and this patch is a
> victim, it won't apply against mm-new at all :(

I rebased it/fixed mm flags stuff locally (manually) so I could test, so I
thought - why not actually share that here to make your life easier :P

It's attached, if I didn't mess up with neomutt (big IF)

Cheers, Lorenzo
From 16b7f9d7042de8cb7d6f86b815d3075cedcf1132 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Mon, 18 Aug 2025 16:19:46 +0100
Subject: [PATCH RFC 6/6] mm: change dup_mmap() recovery

When dup_mmap() fails during vma duplication or setup, don't
write the XA_ZERO entry in the vma tree.  Instead, destroy the tree and
free the new resources, leaving an empty vma tree.

Using XA_ZERO introduced races where the vma could be found between
dup_mmap() dropping all locks and exit_mmap() taking the locks.  The
race can occur because the mm can be reached through the other trees
via successfully copied vmas and other methods such as the swapoff code.

XA_ZERO marked the location where vma removal and pagetable freeing
should stop.  The newly added arguments to unmap_vmas() and
free_pgtables() now serve this function.

Replacing the use of the XA_ZERO entry with the new argument list
also means the xa_is_zero() checks are no longer necessary, so they
are also removed.

Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
---
 mm/memory.c |  6 +-----
 mm/mmap.c   | 39 ++++++++++++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 9f584c5e4f43..218b496dc9ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -387,8 +387,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		 * be 0.  This will underflow and is okay.
 		 */
 		next = mas_find(mas, tree_max - 1);
-		if (unlikely(xa_is_zero(next)))
-			next = NULL;

 		/*
 		 * Hide vma from rmap and truncate_pagecache before freeing
@@ -407,8 +405,6 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
 			vma = next;
 			next = mas_find(mas, tree_max - 1);
-			if (unlikely(xa_is_zero(next)))
-				next = NULL;
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
@@ -2106,7 +2102,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
 				 mm_wr_locked);
 		hugetlb_zap_end(vma, &details);
 		vma = mas_find(mas, tree_end - 1);
-	} while (vma && likely(!xa_is_zero(vma)));
+	} while (vma);
 	mmu_notifier_invalidate_range_end(&range);
 }

diff --git a/mm/mmap.c b/mm/mmap.c
index b49df82817ba..ecfeb1987156 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1288,7 +1288,7 @@ void exit_mmap(struct mm_struct *mm)
 	arch_exit_mmap(mm);

 	vma = vma_next(&vmi);
-	if (!vma || unlikely(xa_is_zero(vma))) {
+	if (!vma) {
 		/* Can happen if dup_mmap() received an OOM */
 		mmap_read_unlock(mm);
 		mmap_write_lock(mm);
@@ -1858,20 +1858,37 @@ __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		ksm_fork(mm, oldmm);
 		khugepaged_fork(mm, oldmm);
 	} else {
+		unsigned long max;

 		/*
-		 * The entire maple tree has already been duplicated. If the
-		 * mmap duplication fails, mark the failure point with
-		 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
-		 * stop releasing VMAs that have not been duplicated after this
-		 * point.
+		 * The entire maple tree has already been duplicated, but
+		 * replacing the vmas failed at mpnt (which could be NULL if
+		 * all were allocated but the last vma was not fully set up. Use
+		 * the start address of the failure point to clean up the half
+		 * initialized tree.
 		 */
-		if (mpnt) {
-			mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
-			mas_store(&vmi.mas, XA_ZERO_ENTRY);
-			/* Avoid OOM iterating a broken tree */
-			mm_flags_set(MMF_OOM_SKIP, mm);
+		if (!mm->map_count) {
+			/* zero vmas were written to the new tree. */
+			max = 0;
+		} else if (mpnt) {
+			/* mid-tree failure */
+			max = mpnt->vm_start;
+		} else {
+			/* All vmas were written to the new tree */
+			max = ULONG_MAX;
 		}
+
+		/* Hide mm from oom killer because the memory is being freed */
+		mm_flags_set(MMF_OOM_SKIP, mm);
+		if (max) {
+			vma_iter_set(&vmi, 0);
+			tmp = vma_next(&vmi);
+			flush_cache_mm(mm);
+			unmap_region(&vmi.mas, tmp, 0, max, max, NULL, NULL);
+			charge = tear_down_vmas(mm, &vmi, tmp, max);
+			vm_unacct_memory(charge);
+		}
+		__mt_destroy(&mm->mm_mt);
 		/*
 		 * The mm_struct is going to exit, but the locks will be dropped
 		 * first.  Set the mm_struct as unstable is advisable as it is
--
2.50.1