include/linux/mmap_lock.h | 9 +- include/linux/rmap.h | 67 ------- mm/internal.h | 67 +++++++ mm/rmap.c | 315 ++++++++++++++++++++----------- mm/vma.c | 8 +- tools/testing/vma/vma_internal.h | 16 +- 6 files changed, 288 insertions(+), 194 deletions(-)
The anon_vma logic is hugely confusing and, much like a bundle of wires
entangled with one another, pulling on one thread seems only to lead to
more entanglement elsewhere.
There is a mish-mash of the core implementation, how that implementation is
invoked, how helper functions are invoked and concepts such as adjacent
anon_vma merge and anon_vma object reuse.
This series tries to improve the situation somewhat.
It starts by establishing some invariants in the core anon_vma_clone() and
unlink_anon_vmas() functions, largely expressed via VM_WARN_ON_ONCE()
asserts.
These act as some form of self-documentation as to the conditions we find
ourselves in when invoking these functions.
We also add kdoc comments for anon_vma_clone() and unlink_anon_vmas().
We then update anon_vma_fork() to avoid passing a partially set up (and
thus invalid) VMA to unlink_anon_vmas() - functions which are used both for
partially set up and valid data types have historically been the source of
a lot of confusion and bugs.
We then make use of the established known conditions to directly skip
unfaulted VMAs (rather than implicitly via an empty vma->anon_vma_chain
list).
We remove the confusing anon_vma_merge() function (we already have a
concept of anon_vma merge in that we merge anon_vma's that would otherwise
be compatible except for attributes that mprotect() could change - which
anon_vma_merge() has nothing to do with).
We make the anon_vma functions internal to mm as they do not need to be
used by any other part of the kernel, which allows for future abstraction
without concern about this.
We then reduce the time over which we hold the anon rmap lock in
anon_vma_clone(), as it turns out we can allocate anon_vma_chain objects
without holding this lock, since the objects are not yet accessible from
the rmap.
This should reduce anon_vma lock contention.
This additionally allows us to remove a confusing GFP_NOWAIT, GFP_KERNEL
allocation fallback strategy.
Finally, we explicitly indicate which operation is being performed upon
anon_vma_clone(), and separate out fork-only logic to make it very clear
that anon_vma reuse only occurs on fork.
v3:
* Propagate tags (thanks everyone!)
* Fold fix-patches into series.
* Add fix for syzbot report about an accursed partially-initialised VMA
fault injection error path.
* Fixed a typo and a comment whitespace error I noticed, added some comments
to anon_vma_fork(), and set anon_vma->num_active_vmas = 1 to make it clear
that we're setting this on a newly allocated anon_vma.
v2:
* Propagated tags (thanks all!)
* Corrected typo in 1/8 as per Suren.
* Updated commit message in 1/8 to clarify when we use a downgraded read
lock in unlink_anon_vmas(), as per Suren.
* Updated !src->anon_vma no-op comment as per Suren.
* When anon_vma_clone() fails to allocate we have thus far been invoking
unlink_anon_vmas() to clean up the partially set up VMA. However, this
means we have to account for this (likely impossible) scenario in the
code, which prevents further improvements. Resolve by adding a partial
cleanup function specifically for this case.
* Fixed various other typos.
* Placed check_anon_vma_clone() before the !src->anon_vma check in
anon_vma_clone() in 2/8 as per Suren.
* Retained !vma->anon_vma && !list_empty(&vma->anon_vma_chain) check on
unlink_anon_vmas() as per Liam.
* Added comment about anon_vma's sharing same root in 3/8 as per Suren.
* Updated 7/8 to have cleanup_partial_anon_vmas() do even less - since we
now allocate AVC's first before inserting into the interval tree we do
not need to acquire the lock or do anything special here, just clean up
the AVC's.
* Updated commit messages as necessary.
* Renamed find_reusable_anon_vma() to try_to_reuse_anon_vma() for clarity
as per Suren.
* Added a new assert to check_anon_vma_clone() to make it clear that, when
not forking, we expect dst->anon_vma to be set.
* Renamed vma to dst in try_to_reuse_anon_vma() to make it clear that we're
checking/manipulating the destination VMA.
https://lore.kernel.org/all/cover.1767711638.git.lorenzo.stoakes@oracle.com/
v1:
https://lore.kernel.org/all/cover.1765970117.git.lorenzo.stoakes@oracle.com/
Lorenzo Stoakes (9):
mm/rmap: improve anon_vma_clone(), unlink_anon_vmas() comments, add
asserts
mm/rmap: eliminate partial anon_vma tear-down in anon_vma_fork()
mm/rmap: skip unfaulted VMAs on anon_vma clone, unlink
mm/rmap: remove unnecessary root lock dance in anon_vma clone, unmap
mm/rmap: remove anon_vma_merge() function
mm/rmap: make anon_vma functions internal
mm/mmap_lock: add vma_is_attached() helper
mm/rmap: allocate anon_vma_chain objects unlocked when possible
mm/rmap: separate out fork-only logic on anon_vma_clone()
include/linux/mmap_lock.h | 9 +-
include/linux/rmap.h | 67 -------
mm/internal.h | 67 +++++++
mm/rmap.c | 315 ++++++++++++++++++++-----------
mm/vma.c | 8 +-
tools/testing/vma/vma_internal.h | 16 +-
6 files changed, 288 insertions(+), 194 deletions(-)
--
2.52.0
On Sun, 18 Jan 2026 14:50:36 +0000 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> The anon_vma logic is hugely confusing and, much like a bundle of wires
> entangled with one another, pulling on one thread seems only to lead to
> more entanglement elsewhere.
>
> There is a mish-mash of the core implementation, how that implementation is
> invoked, how helper functions are invoked and concepts such as adjacent
> anon_vma merge and anon_vma object reuse.
>
> This series tries to improve the situation somewhat.
Updated, thanks.
> v3:
> * Propagate tags (thanks everyone!)
> * Fold fix-patches into series.
> * Add fix for syzbot report about an accursed partially-initialised VMA
> fault injection error path.
> * Fixed a typo, a comment whitespace error I noticed and add some comments
> to anon_vma_fork(), set anon_vma->num_active_vmas = 1 to make it clear
> that we're setting this on a newly allocated anon_vma.
Below is how this update altered mm.git:
--- a/mm/rmap.c~b
+++ a/mm/rmap.c
@@ -333,10 +333,10 @@ int anon_vma_clone(struct vm_area_struct
* are not updating the anon_vma rbtree nor are we changing
* anon_vma statistics.
*
- * Either src, dst have the same mm for which we hold an exclusive mmap
- * write lock, or we are forking and we hold it on src->vm_mm and dst is
- * not yet accessible to other threads so there's no possibliity of the
- * unlinked AVC's being observed yet.
+ * Either src, dst have the same mm for which we hold an exclusive mmap
+ * write lock, or we are forking and we hold it on src->vm_mm and dst is
+ * not yet accessible to other threads so there's no possibliity of the
+ * unlinked AVC's being observed yet.
*/
list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
avc = anon_vma_chain_alloc(GFP_KERNEL);
@@ -379,7 +379,7 @@ int anon_vma_fork(struct vm_area_struct
{
struct anon_vma_chain *avc;
struct anon_vma *anon_vma;
- int error;
+ int rc;
/* Don't bother if the parent process has no anon_vma here. */
if (!pvma->anon_vma)
@@ -388,27 +388,35 @@ int anon_vma_fork(struct vm_area_struct
/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
vma->anon_vma = NULL;
+ anon_vma = anon_vma_alloc();
+ if (!anon_vma)
+ return -ENOMEM;
+ avc = anon_vma_chain_alloc(GFP_KERNEL);
+ if (!avc) {
+ put_anon_vma(anon_vma);
+ return -ENOMEM;
+ }
+
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
*/
- error = anon_vma_clone(vma, pvma, VMA_OP_FORK);
- if (error)
- return error;
-
- /* An existing anon_vma has been reused, all done then. */
- if (vma->anon_vma)
- return 0;
+ rc = anon_vma_clone(vma, pvma, VMA_OP_FORK);
+ /* An error arose or an existing anon_vma was reused, all done then. */
+ if (rc || vma->anon_vma) {
+ put_anon_vma(anon_vma);
+ anon_vma_chain_free(avc);
+ return rc;
+ }
- /* Then add our own anon_vma. */
- anon_vma = anon_vma_alloc();
- if (!anon_vma)
- goto out_error;
- anon_vma->num_active_vmas++;
- avc = anon_vma_chain_alloc(GFP_KERNEL);
- if (!avc)
- goto out_error_free_anon_vma;
+ /*
+ * OK no reuse, so add our own anon_vma.
+ *
+ * Since it is not linked anywhere we can safely manipulate anon_vma
+ * fields without a lock.
+ */
+ anon_vma->num_active_vmas = 1;
/*
* The root anon_vma's rwsem is the lock actually used when we
* lock any of the anon_vmas in this anon_vma tree.
@@ -431,12 +439,6 @@ int anon_vma_fork(struct vm_area_struct
anon_vma_unlock_write(anon_vma);
return 0;
-
- out_error_free_anon_vma:
- put_anon_vma(anon_vma);
- out_error:
- unlink_anon_vmas(vma);
- return -ENOMEM;
}
/*
_
© 2016 - 2026 Red Hat, Inc.