[PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone()

Posted by Lorenzo Stoakes 1 month, 3 weeks ago
Specify to anon_vma_clone() which operation is being performed. This allows
us to perform checks specific to each operation type, and to separate out and
make clear that the anon_vma reuse logic is specific to fork only.

This opens the door to further refactorings and refinements later as we
have more information to work with.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/internal.h                    | 11 +++++-
 mm/rmap.c                        | 67 ++++++++++++++++++++++----------
 mm/vma.c                         |  6 +--
 tools/testing/vma/vma_internal.h | 11 +++++-
 4 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 469d4ef1ccc5..b4d4bca0f9a7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -244,7 +244,16 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
 
 struct anon_vma *folio_get_anon_vma(const struct folio *folio);
 
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src);
+/* Operations which modify VMAs. */
+enum vma_operation {
+	VMA_OP_SPLIT,
+	VMA_OP_MERGE_UNFAULTED,
+	VMA_OP_REMAP,
+	VMA_OP_FORK,
+};
+
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+	enum vma_operation operation);
 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma);
 int  __anon_vma_prepare(struct vm_area_struct *vma);
 void unlink_anon_vmas(struct vm_area_struct *vma);
diff --git a/mm/rmap.c b/mm/rmap.c
index de9de6d71c23..f08e6bc57379 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -232,12 +232,13 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
 }
 
 static void check_anon_vma_clone(struct vm_area_struct *dst,
-				 struct vm_area_struct *src)
+				 struct vm_area_struct *src,
+				 enum vma_operation operation)
 {
 	/* The write lock must be held. */
 	mmap_assert_write_locked(src->vm_mm);
-	/* If not a fork (implied by dst->anon_vma) then must be on same mm. */
-	VM_WARN_ON_ONCE(dst->anon_vma && dst->vm_mm != src->vm_mm);
+	/* If not a fork then must be on same mm. */
+	VM_WARN_ON_ONCE(operation != VMA_OP_FORK && dst->vm_mm != src->vm_mm);
 
 	/* No source anon_vma is a no-op. */
 	VM_WARN_ON_ONCE(!src->anon_vma && !list_empty(&src->anon_vma_chain));
@@ -249,6 +250,35 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
 	 * must be the same across dst and src.
 	 */
 	VM_WARN_ON_ONCE(dst->anon_vma && dst->anon_vma != src->anon_vma);
+
+	/* For the anon_vma to be compatible, it can only be singular. */
+	VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
+			!list_is_singular(&src->anon_vma_chain));
+#ifdef CONFIG_PER_VMA_LOCK
+	/* Only merging an unfaulted VMA leaves the destination attached. */
+	VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
+			vma_is_attached(dst));
+#endif
+}
+
+static void find_reusable_anon_vma(struct vm_area_struct *vma,
+				   struct anon_vma *anon_vma)
+{
+	/* If already populated, nothing to do.*/
+	if (vma->anon_vma)
+		return;
+
+	/*
+	 * We reuse an anon_vma if any linking VMAs were unmapped and it has
+	 * only a single child at most.
+	 */
+	if (anon_vma->num_active_vmas > 0)
+		return;
+	if (anon_vma->num_children > 1)
+		return;
+
+	vma->anon_vma = anon_vma;
+	anon_vma->num_active_vmas++;
 }
 
 /**
@@ -256,6 +286,7 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
  * all of the anon_vma objects contained within @src anon_vma_chain's.
  * @dst: The destination VMA with an empty anon_vma_chain.
  * @src: The source VMA we wish to duplicate.
+ * @operation: The type of operation which resulted in the clone.
  *
  * This is the heart of the VMA side of the anon_vma implementation - we invoke
  * this function whenever we need to set up a new VMA's anon_vma state.
@@ -278,14 +309,16 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
  *
  * Returns: 0 on success, -ENOMEM on failure.
  */
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+		   enum vma_operation operation)
 {
 	struct anon_vma_chain *avc, *pavc;
+	struct anon_vma *active_anon_vma = src->anon_vma;
 
-	if (!src->anon_vma)
+	if (!active_anon_vma)
 		return 0;
 
-	check_anon_vma_clone(dst, src);
+	check_anon_vma_clone(dst, src, operation);
 
 	/*
 	 * Allocate AVCs. We don't need an anon_vma lock for this as we
@@ -309,22 +342,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 		struct anon_vma *anon_vma = avc->anon_vma;
 
 		anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
-
-		/*
-		 * Reuse existing anon_vma if it has no vma and only one
-		 * anon_vma child.
-		 *
-		 * Root anon_vma is never reused:
-		 * it has self-parent reference and at least one child.
-		 */
-		if (!dst->anon_vma && src->anon_vma &&
-		    anon_vma->num_children < 2 &&
-		    anon_vma->num_active_vmas == 0)
-			dst->anon_vma = anon_vma;
+		if (operation == VMA_OP_FORK)
+			find_reusable_anon_vma(dst, anon_vma);
 	}
-	if (dst->anon_vma)
+
+	if (operation != VMA_OP_FORK)
 		dst->anon_vma->num_active_vmas++;
-	anon_vma_unlock_write(src->anon_vma);
+
+	anon_vma_unlock_write(active_anon_vma);
 	return 0;
 
  enomem_failure:
@@ -361,7 +386,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	 * First, attach the new VMA to the parent VMA's anon_vmas,
 	 * so rmap can find non-COWed pages in child processes.
 	 */
-	error = anon_vma_clone(vma, pvma);
+	error = anon_vma_clone(vma, pvma, VMA_OP_FORK);
 	if (error)
 		return error;
 
diff --git a/mm/vma.c b/mm/vma.c
index feb4bbd3b259..e297c3a94133 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -525,7 +525,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
 	if (err)
 		goto out_free_vmi;
 
-	err = anon_vma_clone(new, vma);
+	err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
 	if (err)
 		goto out_free_mpol;
 
@@ -623,7 +623,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
 
 		vma_assert_write_locked(dst);
 		dst->anon_vma = src->anon_vma;
-		ret = anon_vma_clone(dst, src);
+		ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
 		if (ret)
 			return ret;
 
@@ -1862,7 +1862,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		vma_set_range(new_vma, addr, addr + len, pgoff);
 		if (vma_dup_policy(vma, new_vma))
 			goto out_free_vma;
-		if (anon_vma_clone(new_vma, vma))
+		if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
 			get_file(new_vma->vm_file);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 93e5792306d9..7fa56dcc53a6 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -600,6 +600,14 @@ struct mmap_action {
 	bool hide_from_rmap_until_complete :1;
 };
 
+/* Operations which modify VMAs. */
+enum vma_operation {
+	VMA_OP_SPLIT,
+	VMA_OP_MERGE_UNFAULTED,
+	VMA_OP_REMAP,
+	VMA_OP_FORK,
+};
+
 /*
  * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
  * manipulate mutable fields which will cause those fields to be updated in the
@@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru
 	return 0;
 }
 
-static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+				 enum vma_operation operation)
 {
 	/* For testing purposes. We indicate that an anon_vma has been cloned. */
 	if (src->anon_vma != NULL) {
-- 
2.52.0
Re: [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone()
Posted by Suren Baghdasaryan 1 month, 1 week ago
On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> [snip]
> +
> +       /* For the anon_vma to be compatible, it can only be singular. */
> +       VM_WARN_ON_ONCE(operation == VMA_OP_MERGE_UNFAULTED &&
> +                       !list_is_singular(&src->anon_vma_chain));
> +#ifdef CONFIG_PER_VMA_LOCK
> +       /* Only merging an unfaulted VMA leaves the destination attached. */

Do you mean to say that for all other operations dst should not yet be
attached? If so, "leaves the destination attached" is a somewhat confusing
choice of words to indicate that attaching the VMA in the other cases has
not yet happened. Maybe reword as: "For cases other than merging an
unfaulted VMA, the VMA should not be attached yet."
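
For illustration, with that wording the check would read something like the
below (sketch only; the assertion itself is unchanged from the patch, only
the comment wording differs):

	/*
	 * For cases other than merging an unfaulted VMA, the VMA should
	 * not be attached yet.
	 */
	VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
			vma_is_attached(dst));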

> +       VM_WARN_ON_ONCE(operation != VMA_OP_MERGE_UNFAULTED &&
> +                       vma_is_attached(dst));
> +#endif
> +}
> +
> +static void find_reusable_anon_vma(struct vm_area_struct *vma,
> +                                  struct anon_vma *anon_vma)

Naming is hard but a function that assigns vma->anon_vma and
increments num_active_vmas should not be called
"find_reusable_anon_vma". I would suggest keeping the name but making
it return the anon_vma (or NULL if it's not reusable), letting the
caller do the assignment and the increment.
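
For illustration only, a rough sketch of that alternative shape (untested and
purely illustrative; the return-NULL form and the caller snippet are assumed,
not part of the patch):

static struct anon_vma *find_reusable_anon_vma(struct vm_area_struct *vma,
					       struct anon_vma *anon_vma)
{
	/* Already populated, nothing to reuse. */
	if (vma->anon_vma)
		return NULL;

	/*
	 * Only reusable if no linking VMAs remain mapped and it has at most
	 * a single child.
	 */
	if (anon_vma->num_active_vmas > 0 || anon_vma->num_children > 1)
		return NULL;

	return anon_vma;
}

with the caller in anon_vma_clone() then doing something like:

	if (operation == VMA_OP_FORK) {
		struct anon_vma *reusable;

		reusable = find_reusable_anon_vma(dst, anon_vma);
		if (reusable) {
			dst->anon_vma = reusable;
			reusable->num_active_vmas++;
		}
	}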

> +{
> +       /* If already populated, nothing to do.*/
> +       if (vma->anon_vma)
> +               return;
> +
> +       /*
> +        * We reuse an anon_vma if any linking VMAs were unmapped and it has
> +        * only a single child at most.
> +        */
> +       if (anon_vma->num_active_vmas > 0)
> +               return;
> +       if (anon_vma->num_children > 1)
> +               return;
> +
> +       vma->anon_vma = anon_vma;
> +       anon_vma->num_active_vmas++;

You moved num_active_vmas++ here to fix the accounting later... Sneaky
but correct AFAICT :)

>  }
>
>  /**
> @@ -256,6 +286,7 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
>   * all of the anon_vma objects contained within @src anon_vma_chain's.
>   * @dst: The destination VMA with an empty anon_vma_chain.
>   * @src: The source VMA we wish to duplicate.
> + * @operation: The type of operation which resulted in the clone.
>   *
>   * This is the heart of the VMA side of the anon_vma implementation - we invoke
>   * this function whenever we need to set up a new VMA's anon_vma state.
> @@ -278,14 +309,16 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
>   *
>   * Returns: 0 on success, -ENOMEM on failure.
>   */
> -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> +                  enum vma_operation operation)
>  {
>         struct anon_vma_chain *avc, *pavc;
> +       struct anon_vma *active_anon_vma = src->anon_vma;
>
> -       if (!src->anon_vma)
> +       if (!active_anon_vma)
>                 return 0;
>
> -       check_anon_vma_clone(dst, src);
> +       check_anon_vma_clone(dst, src, operation);
>
>         /*
>          * Allocate AVCs. We don't need an anon_vma lock for this as we
> @@ -309,22 +342,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
>                 struct anon_vma *anon_vma = avc->anon_vma;
>
>                 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> -
> -               /*
> -                * Reuse existing anon_vma if it has no vma and only one
> -                * anon_vma child.
> -                *
> -                * Root anon_vma is never reused:
> -                * it has self-parent reference and at least one child.
> -                */
> -               if (!dst->anon_vma && src->anon_vma &&
> -                   anon_vma->num_children < 2 &&
> -                   anon_vma->num_active_vmas == 0)
> -                       dst->anon_vma = anon_vma;
> +               if (operation == VMA_OP_FORK)
> +                       find_reusable_anon_vma(dst, anon_vma);
>         }
> -       if (dst->anon_vma)
> +
> +       if (operation != VMA_OP_FORK)
>                 dst->anon_vma->num_active_vmas++;
> -       anon_vma_unlock_write(src->anon_vma);
> +
> +       anon_vma_unlock_write(active_anon_vma);
>         return 0;
>
>   enomem_failure:
> @@ -361,7 +386,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
>          * First, attach the new VMA to the parent VMA's anon_vmas,
>          * so rmap can find non-COWed pages in child processes.
>          */
> -       error = anon_vma_clone(vma, pvma);
> +       error = anon_vma_clone(vma, pvma, VMA_OP_FORK);
>         if (error)
>                 return error;
>
> diff --git a/mm/vma.c b/mm/vma.c
> index feb4bbd3b259..e297c3a94133 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -525,7 +525,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
>         if (err)
>                 goto out_free_vmi;
>
> -       err = anon_vma_clone(new, vma);
> +       err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
>         if (err)
>                 goto out_free_mpol;
>
> @@ -623,7 +623,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
>
>                 vma_assert_write_locked(dst);
>                 dst->anon_vma = src->anon_vma;
> -               ret = anon_vma_clone(dst, src);
> +               ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
>                 if (ret)
>                         return ret;
>
> @@ -1862,7 +1862,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>                 vma_set_range(new_vma, addr, addr + len, pgoff);
>                 if (vma_dup_policy(vma, new_vma))
>                         goto out_free_vma;
> -               if (anon_vma_clone(new_vma, vma))
> +               if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
>                         goto out_free_mempol;
>                 if (new_vma->vm_file)
>                         get_file(new_vma->vm_file);
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 93e5792306d9..7fa56dcc53a6 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -600,6 +600,14 @@ struct mmap_action {
>         bool hide_from_rmap_until_complete :1;
>  };
>
> +/* Operations which modify VMAs. */
> +enum vma_operation {
> +       VMA_OP_SPLIT,
> +       VMA_OP_MERGE_UNFAULTED,
> +       VMA_OP_REMAP,
> +       VMA_OP_FORK,
> +};
> +
>  /*
>   * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
>   * manipulate mutable fields which will cause those fields to be updated in the
> @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru
>         return 0;
>  }
>
> -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> +                                enum vma_operation operation)
>  {
>         /* For testing purposes. We indicate that an anon_vma has been cloned. */
>         if (src->anon_vma != NULL) {
> --
> 2.52.0
>
Re: [PATCH 8/8] mm/rmap: separate out fork-only logic on anon_vma_clone()
Posted by Lorenzo Stoakes 1 month ago
On Tue, Dec 30, 2025 at 02:02:29PM -0800, Suren Baghdasaryan wrote:
> On Wed, Dec 17, 2025 at 4:27 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
> >
> > [snip]
> > +
> > +static void find_reusable_anon_vma(struct vm_area_struct *vma,
> > +                                  struct anon_vma *anon_vma)
>
> Naming is hard but a function that assigns vma->anon_vma and
> increments num_active_vmas should not be called
> "find_reusable_anon_vma". I would suggest keeping the name but making
> it return the anon_vma (or NULL if it's not reusable), letting the
> caller do the assignment and the increment.

I mean totally agree this name is bad actually :) good spot.

Since vma->anon_vma might already be populated we can't simply return NULL, so
I think a rename is better here actually -> try_to_reuse_anon_vma(), which more
accurately describes what's going on and takes into account the fact that it's
doing a stats update also!

>
> > +{
> > +       /* If already populated, nothing to do.*/
> > +       if (vma->anon_vma)
> > +               return;
> > +
> > +       /*
> > +        * We reuse an anon_vma if any linking VMAs were unmapped and it has
> > +        * only a single child at most.
> > +        */
> > +       if (anon_vma->num_active_vmas > 0)
> > +               return;
> > +       if (anon_vma->num_children > 1)
> > +               return;
> > +
> > +       vma->anon_vma = anon_vma;
> > +       anon_vma->num_active_vmas++;
>
> You moved num_active_vmas++ here to fix the accounting later... Sneaky
> but correct AFAICT :)

;)

I would say in my defence (perhaps unnecessarily, as being sneaky is a key
kernel dev skill imo ;) that it's no more sneaky than the code already
was. Probably :)

Really this is about making the situation clearer in anon_vma_clone(). We could
instead have:

	if (dst->anon_vma)
		dst->anon_vma->num_active_vmas++;

And with the assignment here, we cover off the fork + reuse case.

However this doesn't make it clear that in fact for all non-fork cases we are
_always_ incrementing this value, which is what I wanted to make super obvious.

One thing here that I think would help is to rename vma to dst just to
carry that across from anon_vma_clone() and to make it clear we're
checking/manipulating the destination VMA here.
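
Putting those two renames together, the helper might end up looking roughly
like this (just an illustrative sketch of a possible next revision, not the
actual respin):

static void try_to_reuse_anon_vma(struct vm_area_struct *dst,
				  struct anon_vma *anon_vma)
{
	/* If already populated, nothing to do. */
	if (dst->anon_vma)
		return;

	/*
	 * We reuse an anon_vma if any linking VMAs were unmapped and it has
	 * only a single child at most.
	 */
	if (anon_vma->num_active_vmas > 0)
		return;
	if (anon_vma->num_children > 1)
		return;

	dst->anon_vma = anon_vma;
	anon_vma->num_active_vmas++;
}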

>
> >  }
> >
> >  /**
> > @@ -256,6 +286,7 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
> >   * all of the anon_vma objects contained within @src anon_vma_chain's.
> >   * @dst: The destination VMA with an empty anon_vma_chain.
> >   * @src: The source VMA we wish to duplicate.
> > + * @operation: The type of operation which resulted in the clone.
> >   *
> >   * This is the heart of the VMA side of the anon_vma implementation - we invoke
> >   * this function whenever we need to set up a new VMA's anon_vma state.
> > @@ -278,14 +309,16 @@ static void check_anon_vma_clone(struct vm_area_struct *dst,
> >   *
> >   * Returns: 0 on success, -ENOMEM on failure.
> >   */
> > -int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> > +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> > +                  enum vma_operation operation)
> >  {
> >         struct anon_vma_chain *avc, *pavc;
> > +       struct anon_vma *active_anon_vma = src->anon_vma;
> >
> > -       if (!src->anon_vma)
> > +       if (!active_anon_vma)
> >                 return 0;
> >
> > -       check_anon_vma_clone(dst, src);
> > +       check_anon_vma_clone(dst, src, operation);
> >
> >         /*
> >          * Allocate AVCs. We don't need an anon_vma lock for this as we
> > @@ -309,22 +342,14 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> >                 struct anon_vma *anon_vma = avc->anon_vma;
> >
> >                 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
> > -
> > -               /*
> > -                * Reuse existing anon_vma if it has no vma and only one
> > -                * anon_vma child.
> > -                *
> > -                * Root anon_vma is never reused:
> > -                * it has self-parent reference and at least one child.
> > -                */
> > -               if (!dst->anon_vma && src->anon_vma &&
> > -                   anon_vma->num_children < 2 &&
> > -                   anon_vma->num_active_vmas == 0)
> > -                       dst->anon_vma = anon_vma;
> > +               if (operation == VMA_OP_FORK)
> > +                       find_reusable_anon_vma(dst, anon_vma);
> >         }
> > -       if (dst->anon_vma)
> > +
> > +       if (operation != VMA_OP_FORK)
> >                 dst->anon_vma->num_active_vmas++;
> > -       anon_vma_unlock_write(src->anon_vma);
> > +
> > +       anon_vma_unlock_write(active_anon_vma);
> >         return 0;
> >
> >   enomem_failure:
> > @@ -361,7 +386,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> >          * First, attach the new VMA to the parent VMA's anon_vmas,
> >          * so rmap can find non-COWed pages in child processes.
> >          */
> > -       error = anon_vma_clone(vma, pvma);
> > +       error = anon_vma_clone(vma, pvma, VMA_OP_FORK);
> >         if (error)
> >                 return error;
> >
> > diff --git a/mm/vma.c b/mm/vma.c
> > index feb4bbd3b259..e297c3a94133 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -525,7 +525,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >         if (err)
> >                 goto out_free_vmi;
> >
> > -       err = anon_vma_clone(new, vma);
> > +       err = anon_vma_clone(new, vma, VMA_OP_SPLIT);
> >         if (err)
> >                 goto out_free_mpol;
> >
> > @@ -623,7 +623,7 @@ static int dup_anon_vma(struct vm_area_struct *dst,
> >
> >                 vma_assert_write_locked(dst);
> >                 dst->anon_vma = src->anon_vma;
> > -               ret = anon_vma_clone(dst, src);
> > +               ret = anon_vma_clone(dst, src, VMA_OP_MERGE_UNFAULTED);
> >                 if (ret)
> >                         return ret;
> >
> > @@ -1862,7 +1862,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> >                 vma_set_range(new_vma, addr, addr + len, pgoff);
> >                 if (vma_dup_policy(vma, new_vma))
> >                         goto out_free_vma;
> > -               if (anon_vma_clone(new_vma, vma))
> > +               if (anon_vma_clone(new_vma, vma, VMA_OP_REMAP))
> >                         goto out_free_mempol;
> >                 if (new_vma->vm_file)
> >                         get_file(new_vma->vm_file);
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 93e5792306d9..7fa56dcc53a6 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -600,6 +600,14 @@ struct mmap_action {
> >         bool hide_from_rmap_until_complete :1;
> >  };
> >
> > +/* Operations which modify VMAs. */
> > +enum vma_operation {
> > +       VMA_OP_SPLIT,
> > +       VMA_OP_MERGE_UNFAULTED,
> > +       VMA_OP_REMAP,
> > +       VMA_OP_FORK,
> > +};
> > +
> >  /*
> >   * Describes a VMA that is about to be mmap()'ed. Drivers may choose to
> >   * manipulate mutable fields which will cause those fields to be updated in the
> > @@ -1157,7 +1165,8 @@ static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_stru
> >         return 0;
> >  }
> >
> > -static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> > +static inline int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
> > +                                enum vma_operation operation)
> >  {
> >         /* For testing purposes. We indicate that an anon_vma has been cloned. */
> >         if (src->anon_vma != NULL) {
> > --
> > 2.52.0
> >