[PATCH] mm/rmap: make num_children and num_active_vmas update in internally

Yajun Deng posted 1 patch 4 days, 12 hours ago
There is a newer version of this series
mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
1 file changed, 32 insertions(+), 31 deletions(-)
[PATCH] mm/rmap: make num_children and num_active_vmas update in internally
Posted by Yajun Deng 4 days, 12 hours ago
If the anon_vma_alloc() is called, the num_children of the parent of
the anon_vma will be updated. But this operation occurs outside of
anon_vma_alloc().

The num_active_vmas are also updated outside of anon_vma.

Pass the parent of anon_vma to the anon_vma_alloc() and update the
num_children inside it.

Introduce anon_vma_attach() and anon_vma_detach() to update
num_active_vmas with the anon_vma.

Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
---
 mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index 34333ae3bd80..2a28edfa5734 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -86,15 +86,21 @@
 static struct kmem_cache *anon_vma_cachep;
 static struct kmem_cache *anon_vma_chain_cachep;
 
-static inline struct anon_vma *anon_vma_alloc(void)
+static inline struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
 {
 	struct anon_vma *anon_vma;
 
 	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
-	if (anon_vma) {
-		atomic_set(&anon_vma->refcount, 1);
-		anon_vma->num_children = 0;
-		anon_vma->num_active_vmas = 0;
+	if (!anon_vma)
+		return NULL;
+
+	atomic_set(&anon_vma->refcount, 1);
+	anon_vma->num_children = 0;
+	anon_vma->num_active_vmas = 0;
+	if (parent) {
+		anon_vma->parent = parent;
+		anon_vma->root = parent->root;
+	} else {
 		anon_vma->parent = anon_vma;
 		/*
 		 * Initialise the anon_vma root to point to itself. If called
@@ -102,6 +108,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
 		 */
 		anon_vma->root = anon_vma;
 	}
+	anon_vma->parent->num_children++;
 
 	return anon_vma;
 }
@@ -146,6 +153,19 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
 	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
 }
 
+static inline void anon_vma_attach(struct vm_area_struct *vma,
+				   struct anon_vma *anon_vma)
+{
+	vma->anon_vma = anon_vma;
+	vma->anon_vma->num_active_vmas++;
+}
+
+static inline void anon_vma_detach(struct vm_area_struct *vma)
+{
+	vma->anon_vma->num_active_vmas--;
+	vma->anon_vma = NULL;
+}
+
 static void anon_vma_chain_link(struct vm_area_struct *vma,
 				struct anon_vma_chain *avc,
 				struct anon_vma *anon_vma)
@@ -198,10 +218,9 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
 	anon_vma = find_mergeable_anon_vma(vma);
 	allocated = NULL;
 	if (!anon_vma) {
-		anon_vma = anon_vma_alloc();
+		anon_vma = anon_vma_alloc(NULL);
 		if (unlikely(!anon_vma))
 			goto out_enomem_free_avc;
-		anon_vma->num_children++; /* self-parent link for new root */
 		allocated = anon_vma;
 	}
 
@@ -209,9 +228,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
 	/* page_table_lock to protect against threads */
 	spin_lock(&mm->page_table_lock);
 	if (likely(!vma->anon_vma)) {
-		vma->anon_vma = anon_vma;
+		anon_vma_attach(vma, anon_vma);
 		anon_vma_chain_link(vma, avc, anon_vma);
-		anon_vma->num_active_vmas++;
 		allocated = NULL;
 		avc = NULL;
 	}
@@ -306,10 +324,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 		if (!dst->anon_vma && src->anon_vma &&
 		    anon_vma->num_children < 2 &&
 		    anon_vma->num_active_vmas == 0)
-			dst->anon_vma = anon_vma;
+			anon_vma_attach(dst, anon_vma);
 	}
-	if (dst->anon_vma)
-		dst->anon_vma->num_active_vmas++;
 	unlock_anon_vma_root(root);
 	return 0;
 
@@ -356,31 +372,22 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 		return 0;
 
 	/* Then add our own anon_vma. */
-	anon_vma = anon_vma_alloc();
+	anon_vma = anon_vma_alloc(pvma->anon_vma);
 	if (!anon_vma)
 		goto out_error;
-	anon_vma->num_active_vmas++;
 	avc = anon_vma_chain_alloc(GFP_KERNEL);
 	if (!avc)
 		goto out_error_free_anon_vma;
 
-	/*
-	 * The root anon_vma's rwsem is the lock actually used when we
-	 * lock any of the anon_vmas in this anon_vma tree.
-	 */
-	anon_vma->root = pvma->anon_vma->root;
-	anon_vma->parent = pvma->anon_vma;
 	/*
 	 * With refcounts, an anon_vma can stay around longer than the
 	 * process it belongs to. The root anon_vma needs to be pinned until
 	 * this anon_vma is freed, because the lock lives in the root.
 	 */
 	get_anon_vma(anon_vma->root);
-	/* Mark this anon_vma as the one where our new (COWed) pages go. */
-	vma->anon_vma = anon_vma;
+	anon_vma_attach(vma, anon_vma);
 	anon_vma_lock_write(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
-	anon_vma->parent->num_children++;
 	anon_vma_unlock_write(anon_vma);
 
 	return 0;
@@ -419,15 +426,9 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
-	if (vma->anon_vma) {
-		vma->anon_vma->num_active_vmas--;
+	if (vma->anon_vma)
+		anon_vma_detach(vma);
 
-		/*
-		 * vma would still be needed after unlink, and anon_vma will be prepared
-		 * when handle fault.
-		 */
-		vma->anon_vma = NULL;
-	}
 	unlock_anon_vma_root(root);
 
 	/*
-- 
2.25.1
[syzbot ci] Re: mm/rmap: make num_children and num_active_vmas update in internally
Posted by syzbot ci 1 day, 19 hours ago
syzbot ci has tested the following series

[v1] mm/rmap: make num_children and num_active_vmas update in internally
https://lore.kernel.org/all/20250905132019.18915-1-yajun.deng@linux.dev
* [PATCH] mm/rmap: make num_children and num_active_vmas update in internally

and found the following issue:
WARNING in unlink_anon_vmas

Full report is available here:
https://ci.syzbot.org/series/8f2c8574-826d-4951-a11c-201f9e6fa666

***

WARNING in unlink_anon_vmas

tree:      torvalds
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base:      b320789d6883cc00ac78ce83bccbfe7ed58afcf0
arch:      amd64
compiler:  Debian clang version 20.1.8 (++20250708063551+0c9f909b7976-1~exp1~20250708183702.136), Debian LLD 20.1.8
config:    https://ci.syzbot.org/builds/6702bc54-be0f-4db2-85d2-1488a2ec7040/config

------------[ cut here ]------------
WARNING: CPU: 1 PID: 5260 at mm/rmap.c:444 unlink_anon_vmas+0x63c/0x670
Modules linked in:
CPU: 1 UID: 0 PID: 5260 Comm: mount Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:unlink_anon_vmas+0x63c/0x670
Code: 08 00 48 3b 6c 24 08 74 29 e8 30 7e af ff 49 89 ee e9 d0 fd ff ff e8 23 7e af ff 90 0f 0b 90 e9 4e fe ff ff e8 15 7e af ff 90 <0f> 0b 90 e9 82 fe ff ff e8 07 7e af ff eb 05 e8 00 7e af ff 48 83
RSP: 0018:ffffc90001bef878 EFLAGS: 00010293
RAX: ffffffff82102ffb RBX: ffff888109476118 RCX: ffff8880215d9cc0
RDX: 0000000000000000 RSI: ffffffffffffffff RDI: 0000000000000000
RBP: ffff88810f31e030 R08: ffff88810947611f R09: 1ffff1102128ec23
R10: dffffc0000000000 R11: ffffed102128ec24 R12: ffffffffffffffff
R13: 1ffff110212a6ce2 R14: ffff888109536710 R15: ffff888109476110
FS:  0000000000000000(0000) GS:ffff8881a3c18000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f8adf7b0440 CR3: 000000000df36000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 free_pgtables+0x72b/0x9c0
 exit_mmap+0x444/0xb50
 __mmput+0x118/0x420
 exit_mm+0x1da/0x2c0
 do_exit+0x648/0x2300
 do_group_exit+0x21c/0x2d0
 __x64_sys_exit_group+0x3f/0x40
 x64_sys_call+0x21f7/0x2200
 do_syscall_64+0xfa/0x3b0
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f8adf86ea90
Code: Unable to access opcode bytes at 0x7f8adf86ea66.
RSP: 002b:00007fffb8de8148 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 00007f8adf95f860 RCX: 00007f8adf86ea90
RDX: 00000000000000e7 RSI: 000000000000003c RDI: 0000000000000000
RBP: 00007f8adf95f860 R08: 00007fffb8de7fc0 R09: 00007fffb8de80a0
R10: 00007fffb8de8000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000000000 R14: 00007f8adf963658 R15: 0000000000000001
 </TASK>


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
Re: [PATCH] mm/rmap: make num_children and num_active_vmas update in internally
Posted by Liam R. Howlett 4 days, 10 hours ago
* Yajun Deng <yajun.deng@linux.dev> [250905 09:21]:
> If the anon_vma_alloc() is called, the num_children of the parent of
> the anon_vma will be updated. But this operation occurs outside of
> anon_vma_alloc().
> 
> The num_active_vmas are also updated outside of anon_vma.
> 
> Pass the parent of anon_vma to the anon_vma_alloc() and update the
> num_children inside it.
> 
> Introduce anon_vma_attach() and anon_vma_detach() to update
> num_active_vmas with the anon_vma.
> 
> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> ---
>  mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
>  1 file changed, 32 insertions(+), 31 deletions(-)
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 34333ae3bd80..2a28edfa5734 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -86,15 +86,21 @@
>  static struct kmem_cache *anon_vma_cachep;
>  static struct kmem_cache *anon_vma_chain_cachep;
>  
> -static inline struct anon_vma *anon_vma_alloc(void)
> +static inline struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
>  {
>  	struct anon_vma *anon_vma;
>  
>  	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> -	if (anon_vma) {
> -		atomic_set(&anon_vma->refcount, 1);
> -		anon_vma->num_children = 0;
> -		anon_vma->num_active_vmas = 0;
> +	if (!anon_vma)
> +		return NULL;
> +
> +	atomic_set(&anon_vma->refcount, 1);
> +	anon_vma->num_children = 0;
> +	anon_vma->num_active_vmas = 0;
> +	if (parent) {
> +		anon_vma->parent = parent;
> +		anon_vma->root = parent->root;
> +	} else {
>  		anon_vma->parent = anon_vma;
>  		/*
>  		 * Initialise the anon_vma root to point to itself. If called
> @@ -102,6 +108,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
>  		 */
>  		anon_vma->root = anon_vma;
>  	}
> +	anon_vma->parent->num_children++;
>  
>  	return anon_vma;
>  }
> @@ -146,6 +153,19 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
>  	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
>  }
>  
> +static inline void anon_vma_attach(struct vm_area_struct *vma,
> +				   struct anon_vma *anon_vma)
> +{
> +	vma->anon_vma = anon_vma;
> +	vma->anon_vma->num_active_vmas++;
> +}
> +
> +static inline void anon_vma_detach(struct vm_area_struct *vma)
> +{
> +	vma->anon_vma->num_active_vmas--;
> +	vma->anon_vma = NULL;
> +}
> +

It is a bit odd that you are setting a vma value with the prefix of
anon_vma.  Surely there is a better name: vma_attach_anon() ?  And since
this is editing the vma, should it be in rmap.c or vma.h?

>  static void anon_vma_chain_link(struct vm_area_struct *vma,
>  				struct anon_vma_chain *avc,
>  				struct anon_vma *anon_vma)
> @@ -198,10 +218,9 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
>  	anon_vma = find_mergeable_anon_vma(vma);
>  	allocated = NULL;
>  	if (!anon_vma) {
> -		anon_vma = anon_vma_alloc();
> +		anon_vma = anon_vma_alloc(NULL);

I don't love passing NULL for parent, it's two if statements to do the
same work as before - we already know that parent is NULL by this point,
but we call a function to check it again.

>  		if (unlikely(!anon_vma))
>  			goto out_enomem_free_avc;
> -		anon_vma->num_children++; /* self-parent link for new root */
>  		allocated = anon_vma;
>  	}
>  
> @@ -209,9 +228,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
>  	/* page_table_lock to protect against threads */
>  	spin_lock(&mm->page_table_lock);
>  	if (likely(!vma->anon_vma)) {
> -		vma->anon_vma = anon_vma;
> +		anon_vma_attach(vma, anon_vma);
>  		anon_vma_chain_link(vma, avc, anon_vma);
> -		anon_vma->num_active_vmas++;
>  		allocated = NULL;
>  		avc = NULL;
>  	}
> @@ -306,10 +324,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
>  		if (!dst->anon_vma && src->anon_vma &&
>  		    anon_vma->num_children < 2 &&
>  		    anon_vma->num_active_vmas == 0)
> -			dst->anon_vma = anon_vma;
> +			anon_vma_attach(dst, anon_vma);
>  	}
> -	if (dst->anon_vma)
> -		dst->anon_vma->num_active_vmas++;
>  	unlock_anon_vma_root(root);
>  	return 0;

anon_vma_clone() has a goto label of enomem_failure that needs to be
handled correctly.  Looks like you have to avoid zeroing dst before
unlink_anon_vmas(vma) there.

>  
> @@ -356,31 +372,22 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
>  		return 0;
>  
>  	/* Then add our own anon_vma. */
> -	anon_vma = anon_vma_alloc();
> +	anon_vma = anon_vma_alloc(pvma->anon_vma);
>  	if (!anon_vma)
>  		goto out_error;
> -	anon_vma->num_active_vmas++;
>  	avc = anon_vma_chain_alloc(GFP_KERNEL);
>  	if (!avc)
>  		goto out_error_free_anon_vma;

At this point anon_vma has a parent set and the parent->num_children++,
but vma->anon_vma != anon_vma yet.  If avc fails here, we will put the
anon_vma but leave the parent with num_children incremented, since
unlink_anon_vmas() will not find anything.

>  
> -	/*
> -	 * The root anon_vma's rwsem is the lock actually used when we
> -	 * lock any of the anon_vmas in this anon_vma tree.
> -	 */

This information is lost when adding the parent passthrough.

> -	anon_vma->root = pvma->anon_vma->root;
> -	anon_vma->parent = pvma->anon_vma;
>  	/*
>  	 * With refcounts, an anon_vma can stay around longer than the
>  	 * process it belongs to. The root anon_vma needs to be pinned until
>  	 * this anon_vma is freed, because the lock lives in the root.
>  	 */
>  	get_anon_vma(anon_vma->root);
> -	/* Mark this anon_vma as the one where our new (COWed) pages go. */
> -	vma->anon_vma = anon_vma;
> +	anon_vma_attach(vma, anon_vma);

So now we are in the same situation, we know what we need to do with the
parent, but we have to run through another if statement to get it to
happen instead of assigning it.

>  	anon_vma_lock_write(anon_vma);
>  	anon_vma_chain_link(vma, avc, anon_vma);
> -	anon_vma->parent->num_children++;
>  	anon_vma_unlock_write(anon_vma);
>  
>  	return 0;
> @@ -419,15 +426,9 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
>  		list_del(&avc->same_vma);
>  		anon_vma_chain_free(avc);
>  	}
> -	if (vma->anon_vma) {
> -		vma->anon_vma->num_active_vmas--;
> +	if (vma->anon_vma)
> +		anon_vma_detach(vma);
>  
> -		/*
> -		 * vma would still be needed after unlink, and anon_vma will be prepared
> -		 * when handle fault.
> -		 */

It is still worth keeping the comment here too.

> -		vma->anon_vma = NULL;
> -	}
>  	unlock_anon_vma_root(root);
>  
>  	/*
> -- 
> 2.25.1
> 
>
Re: [PATCH] mm/rmap: make num_children and num_active_vmas update in internally
Posted by Yajun Deng 3 days, 10 hours ago
September 5, 2025 at 11:16 PM, "Liam R. Howlett" <Liam.Howlett@oracle.com mailto:Liam.Howlett@oracle.com?to=%22Liam%20R.%20Howlett%22%20%3CLiam.Howlett%40oracle.com%3E > wrote:


> 
> * Yajun Deng <yajun.deng@linux.dev> [250905 09:21]:
> 
> > 
> > If the anon_vma_alloc() is called, the num_children of the parent of
> >  the anon_vma will be updated. But this operation occurs outside of
> >  anon_vma_alloc().
> >  
> >  The num_active_vmas are also updated outside of anon_vma.
> >  
> >  Pass the parent of anon_vma to the anon_vma_alloc() and update the
> >  num_children inside it.
> >  
> >  Introduce anon_vma_attach() and anon_vma_detach() to update
> >  num_active_vmas with the anon_vma.
> >  
> >  Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> >  ---
> >  mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
> >  1 file changed, 32 insertions(+), 31 deletions(-)
> >  
> >  diff --git a/mm/rmap.c b/mm/rmap.c
> >  index 34333ae3bd80..2a28edfa5734 100644
> >  --- a/mm/rmap.c
> >  +++ b/mm/rmap.c
> >  @@ -86,15 +86,21 @@
> >  static struct kmem_cache *anon_vma_cachep;
> >  static struct kmem_cache *anon_vma_chain_cachep;
> >  
> >  -static inline struct anon_vma *anon_vma_alloc(void)
> >  +static inline struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
> >  {
> >  struct anon_vma *anon_vma;
> >  
> >  anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> >  - if (anon_vma) {
> >  - atomic_set(&anon_vma->refcount, 1);
> >  - anon_vma->num_children = 0;
> >  - anon_vma->num_active_vmas = 0;
> >  + if (!anon_vma)
> >  + return NULL;
> >  +
> >  + atomic_set(&anon_vma->refcount, 1);
> >  + anon_vma->num_children = 0;
> >  + anon_vma->num_active_vmas = 0;
> >  + if (parent) {
> >  + anon_vma->parent = parent;
> >  + anon_vma->root = parent->root;
> >  + } else {
> >  anon_vma->parent = anon_vma;
> >  /*
> >  * Initialise the anon_vma root to point to itself. If called
> >  @@ -102,6 +108,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
> >  */
> >  anon_vma->root = anon_vma;
> >  }
> >  + anon_vma->parent->num_children++;
> >  
> >  return anon_vma;
> >  }
> >  @@ -146,6 +153,19 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
> >  kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
> >  }
> >  
> >  +static inline void anon_vma_attach(struct vm_area_struct *vma,
> >  + struct anon_vma *anon_vma)
> >  +{
> >  + vma->anon_vma = anon_vma;
> >  + vma->anon_vma->num_active_vmas++;
> >  +}
> >  +
> >  +static inline void anon_vma_detach(struct vm_area_struct *vma)
> >  +{
> >  + vma->anon_vma->num_active_vmas--;
> >  + vma->anon_vma = NULL;
> >  +}
> >  +
> > 
> It is a bit odd that you are setting a vma value with the prefix of
> anon_vma. Surely there is a better name: vma_attach_anon() ? And since
> this is editing the vma, should it be in rmap.c or vma.h?
> 

I will move them to vma.h.
> > 
> > static void anon_vma_chain_link(struct vm_area_struct *vma,
> >  struct anon_vma_chain *avc,
> >  struct anon_vma *anon_vma)
> >  @@ -198,10 +218,9 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> >  anon_vma = find_mergeable_anon_vma(vma);
> >  allocated = NULL;
> >  if (!anon_vma) {
> >  - anon_vma = anon_vma_alloc();
> >  + anon_vma = anon_vma_alloc(NULL);
> > 
> I don't love passing NULL for parent, it's two if statements to do the
> same work as before - we already know that parent is NULL by this point,
> but we call a function to check it again.
> 

I will add a wapper function.
> > 
> > if (unlikely(!anon_vma))
> >  goto out_enomem_free_avc;
> >  - anon_vma->num_children++; /* self-parent link for new root */
> >  allocated = anon_vma;
> >  }
> >  
> >  @@ -209,9 +228,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> >  /* page_table_lock to protect against threads */
> >  spin_lock(&mm->page_table_lock);
> >  if (likely(!vma->anon_vma)) {
> >  - vma->anon_vma = anon_vma;
> >  + anon_vma_attach(vma, anon_vma);
> >  anon_vma_chain_link(vma, avc, anon_vma);
> >  - anon_vma->num_active_vmas++;
> >  allocated = NULL;
> >  avc = NULL;
> >  }
> >  @@ -306,10 +324,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> >  if (!dst->anon_vma && src->anon_vma &&
> >  anon_vma->num_children < 2 &&
> >  anon_vma->num_active_vmas == 0)
> >  - dst->anon_vma = anon_vma;
> >  + anon_vma_attach(dst, anon_vma);
> >  }
> >  - if (dst->anon_vma)
> >  - dst->anon_vma->num_active_vmas++;
> >  unlock_anon_vma_root(root);
> >  return 0;
> > 
> anon_vma_clone() has a goto label of enomem_failure that needs to be
> handled correctly. Looks like you have to avoid zeroing dst before
> unlink_anon_vmas(vma) there.
> 

Yes, it's an error.
> > 
> > @@ -356,31 +372,22 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> >  return 0;
> >  
> >  /* Then add our own anon_vma. */
> >  - anon_vma = anon_vma_alloc();
> >  + anon_vma = anon_vma_alloc(pvma->anon_vma);
> >  if (!anon_vma)
> >  goto out_error;
> >  - anon_vma->num_active_vmas++;
> >  avc = anon_vma_chain_alloc(GFP_KERNEL);
> >  if (!avc)
> >  goto out_error_free_anon_vma;
> > 
> At this point anon_vma has a parent set and the parent->num_children++,
> but vma->anon_vma != anon_vma yet. If avc fails here, we will put the
> anon_vma but leave the parent with num_children incremented, since
> unlink_anon_vmas() will not find anything.
> 

Yes, it's an error.
> > 
> > - /*
> >  - * The root anon_vma's rwsem is the lock actually used when we
> >  - * lock any of the anon_vmas in this anon_vma tree.
> >  - */
> > 
> This information is lost when adding the parent passthrough.
> 

I'll add it back.

> > 
> > - anon_vma->root = pvma->anon_vma->root;
> >  - anon_vma->parent = pvma->anon_vma;
> >  /*
> >  * With refcounts, an anon_vma can stay around longer than the
> >  * process it belongs to. The root anon_vma needs to be pinned until
> >  * this anon_vma is freed, because the lock lives in the root.
> >  */
> >  get_anon_vma(anon_vma->root);
> >  - /* Mark this anon_vma as the one where our new (COWed) pages go. */
> >  - vma->anon_vma = anon_vma;
> >  + anon_vma_attach(vma, anon_vma);
> > 
> So now we are in the same situation, we know what we need to do with the
> parent, but we have to run through another if statement to get it to
> happen instead of assigning it.
> 

Some code like it.
init_tg_rt_entry() has two callers. One has a parent, the other does not.

> > 
> > anon_vma_lock_write(anon_vma);
> >  anon_vma_chain_link(vma, avc, anon_vma);
> >  - anon_vma->parent->num_children++;
> >  anon_vma_unlock_write(anon_vma);
> >  
> >  return 0;
> >  @@ -419,15 +426,9 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> >  list_del(&avc->same_vma);
> >  anon_vma_chain_free(avc);
> >  }
> >  - if (vma->anon_vma) {
> >  - vma->anon_vma->num_active_vmas--;
> >  + if (vma->anon_vma)
> >  + anon_vma_detach(vma);
> >  
> >  - /*
> >  - * vma would still be needed after unlink, and anon_vma will be prepared
> >  - * when handle fault.
> >  - */
> > 
> It is still worth keeping the comment here too.
> 

Okay.

> > 
> > - vma->anon_vma = NULL;
> >  - }
> >  unlock_anon_vma_root(root);
> >  
> >  /*
> >  -- 
> >  2.25.1
> >
>
Re: [PATCH] mm/rmap: make num_children and num_active_vmas update in internally
Posted by Lorenzo Stoakes 4 days, 10 hours ago
On Fri, Sep 05, 2025 at 01:20:19PM +0000, Yajun Deng wrote:
> If the anon_vma_alloc() is called, the num_children of the parent of
> the anon_vma will be updated. But this operation occurs outside of
> anon_vma_alloc().

I don't like that what is supposed to be an allocation function is now also
doing additional work.

And there's literally only one place where this matters, since
__anon_vma_prepare() doesn't have a parent, it's ltierally only anon_vma_fork().

And there are only 2 callers, so I don't see the purpose in doing this?

>
> The num_active_vmas are also updated outside of anon_vma.

I don't know what 'outside of anon_vma' means?


>
> Pass the parent of anon_vma to the anon_vma_alloc() and update the
> num_children inside it.
>
> Introduce anon_vma_attach() and anon_vma_detach() to update
> num_active_vmas with the anon_vma.

I really dislike this naming - VMA detachment means something entirely
different, you're not really 'attaching' or 'detaching' anything you're just
changing a single stat of the ones you'd need to change were you truly doing so
etc. etc.

It's misleading.

>
> Signed-off-by: Yajun Deng <yajun.deng@linux.dev>

Did you test this at all? It causes an immediate kernel panic for me when I run
it in a VM:

In exit_mmap() -> free->pgtables() -> unlink_anon_vmas()

I haven't really dug into why but yeah, this is broken.


> ---
>  mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
>  1 file changed, 32 insertions(+), 31 deletions(-)
>
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 34333ae3bd80..2a28edfa5734 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -86,15 +86,21 @@
>  static struct kmem_cache *anon_vma_cachep;
>  static struct kmem_cache *anon_vma_chain_cachep;
>
> -static inline struct anon_vma *anon_vma_alloc(void)
> +static inline struct anon_vma *anon_vma_alloc(struct anon_vma *parent)

I really dislike this, we only allocate with a parent in the fork case, this is
just not a win imo + adds confusion.

>  {
>  	struct anon_vma *anon_vma;
>
>  	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> -	if (anon_vma) {
> -		atomic_set(&anon_vma->refcount, 1);
> -		anon_vma->num_children = 0;
> -		anon_vma->num_active_vmas = 0;
> +	if (!anon_vma)
> +		return NULL;
> +
> +	atomic_set(&anon_vma->refcount, 1);
> +	anon_vma->num_children = 0;
> +	anon_vma->num_active_vmas = 0;
> +	if (parent) {
> +		anon_vma->parent = parent;
> +		anon_vma->root = parent->root;

You are accessing parent fields without a lock. This is not good.

> +	} else {
>  		anon_vma->parent = anon_vma;
>  		/*
>  		 * Initialise the anon_vma root to point to itself. If called
> @@ -102,6 +108,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
>  		 */
>  		anon_vma->root = anon_vma;
>  	}
> +	anon_vma->parent->num_children++;

This is even even even more not good, because you're accessing the parent
without a lock, which is just completely broken.

I note below where you're doing this.

>
>  	return anon_vma;
>  }
> @@ -146,6 +153,19 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
>  	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
>  }
>
> +static inline void anon_vma_attach(struct vm_area_struct *vma,
> +				   struct anon_vma *anon_vma)
> +{
> +	vma->anon_vma = anon_vma;
> +	vma->anon_vma->num_active_vmas++;
> +}

Yeah I hate the naming, you should have lock asserts here, I don't love that
we're pairing the vma and anon_vma like this, and again I just really question
the value of this.

> +
> +static inline void anon_vma_detach(struct vm_area_struct *vma)
> +{
> +	vma->anon_vma->num_active_vmas--;
> +	vma->anon_vma = NULL;
> +}
> +
>  static void anon_vma_chain_link(struct vm_area_struct *vma,
>  				struct anon_vma_chain *avc,
>  				struct anon_vma *anon_vma)
> @@ -198,10 +218,9 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
>  	anon_vma = find_mergeable_anon_vma(vma);
>  	allocated = NULL;
>  	if (!anon_vma) {
> -		anon_vma = anon_vma_alloc();
> +		anon_vma = anon_vma_alloc(NULL);

This 'arbitrary parameter which is NULL' is also pretty horrible.

>  		if (unlikely(!anon_vma))
>  			goto out_enomem_free_avc;
> -		anon_vma->num_children++; /* self-parent link for new root */
>  		allocated = anon_vma;
>  	}
>
> @@ -209,9 +228,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
>  	/* page_table_lock to protect against threads */
>  	spin_lock(&mm->page_table_lock);
>  	if (likely(!vma->anon_vma)) {
> -		vma->anon_vma = anon_vma;
> +		anon_vma_attach(vma, anon_vma);
>  		anon_vma_chain_link(vma, avc, anon_vma);
> -		anon_vma->num_active_vmas++;
>  		allocated = NULL;
>  		avc = NULL;
>  	}
> @@ -306,10 +324,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
>  		if (!dst->anon_vma && src->anon_vma &&
>  		    anon_vma->num_children < 2 &&
>  		    anon_vma->num_active_vmas == 0)
> -			dst->anon_vma = anon_vma;
> +			anon_vma_attach(dst, anon_vma);
>  	}
> -	if (dst->anon_vma)
> -		dst->anon_vma->num_active_vmas++;

You're now losing the cases where we didn't reuse an anon_vma but dst->anon_vma
!= NULL? This is just broken isn't it?

>  	unlock_anon_vma_root(root);
>  	return 0;
>
> @@ -356,31 +372,22 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
>  		return 0;
>
>  	/* Then add our own anon_vma. */
> -	anon_vma = anon_vma_alloc();
> +	anon_vma = anon_vma_alloc(pvma->anon_vma);
>  	if (!anon_vma)
>  		goto out_error;
> -	anon_vma->num_active_vmas++;
>  	avc = anon_vma_chain_alloc(GFP_KERNEL);
>  	if (!avc)
>  		goto out_error_free_anon_vma;
>
> -	/*
> -	 * The root anon_vma's rwsem is the lock actually used when we
> -	 * lock any of the anon_vmas in this anon_vma tree.
> -	 */

Please please PLEASE do not delete comments like this.

> -	anon_vma->root = pvma->anon_vma->root;
> -	anon_vma->parent = pvma->anon_vma;

Yeah this is just not worth it in my opinion. You're putting code specific to
forking in anon_vma_alloc(), which means you've made the code harder to
understand.

>  	/*
>  	 * With refcounts, an anon_vma can stay around longer than the
>  	 * process it belongs to. The root anon_vma needs to be pinned until
>  	 * this anon_vma is freed, because the lock lives in the root.
>  	 */
>  	get_anon_vma(anon_vma->root);
> -	/* Mark this anon_vma as the one where our new (COWed) pages go. */

Again, please do not remove comments like this.

> -	vma->anon_vma = anon_vma;
> +	anon_vma_attach(vma, anon_vma);
>  	anon_vma_lock_write(anon_vma);
>  	anon_vma_chain_link(vma, avc, anon_vma);
> -	anon_vma->parent->num_children++;

So now we're updating things not under the lock?... this is extremely broken.

>  	anon_vma_unlock_write(anon_vma);
>
>  	return 0;
> @@ -419,15 +426,9 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
>  		list_del(&avc->same_vma);
>  		anon_vma_chain_free(avc);
>  	}
> -	if (vma->anon_vma) {
> -		vma->anon_vma->num_active_vmas--;
> +	if (vma->anon_vma)
> +		anon_vma_detach(vma);
>
> -		/*
> -		 * vma would still be needed after unlink, and anon_vma will be prepared
> -		 * when handle fault.
> -		 */

You are removing key documentation of behaviour, please do not do that.

> -		vma->anon_vma = NULL;
> -	}
>  	unlock_anon_vma_root(root);
>
>  	/*
> --
> 2.25.1
>

Overall this patch is really quite broken, but I don't think the general concept
_as implemented here_ really gives much value.

I _like_ the idea of pairing adjustment of these kinds of anon_vma fields like
num_children, num_active_vmas etc. with operations, but I think there's probably
too many fiddly cases + a need for hellish lock stuff for there to be really
anything too viable here.

Cheers, Lorenzo
Re: [PATCH] mm/rmap: make num_children and num_active_vmas update in internally
Posted by Yajun Deng 3 days, 11 hours ago
September 5, 2025 at 10:58 PM, "Lorenzo Stoakes" <lorenzo.stoakes@oracle.com mailto:lorenzo.stoakes@oracle.com?to=%22Lorenzo%20Stoakes%22%20%3Clorenzo.stoakes%40oracle.com%3E > wrote:


> 
> On Fri, Sep 05, 2025 at 01:20:19PM +0000, Yajun Deng wrote:
> 
> > 
> > If the anon_vma_alloc() is called, the num_children of the parent of
> >  the anon_vma will be updated. But this operation occurs outside of
> >  anon_vma_alloc().
> > 
> I don't like that what is supposed to be an allocation function is now also
> doing additional work.
> 
> And there's literally only one place where this matters, since
> __anon_vma_prepare() doesn't have a parent, it's ltierally only anon_vma_fork().
> 
> And there are only 2 callers, so I don't see the purpose in doing this?
> 

Yes, it is an allocation function. Some code doing like it.
alloc_fair_sched_group() and alloc_rt_sched_group() are allocation functions.
They are also pass the parent parameter to them.

The purpose of this is to unify the code. They are essentially the same.
One has itself as its parent, while another has a real parent.

> > 
> > The num_active_vmas are also updated outside of anon_vma.
> > 
> I don't know what 'outside of anon_vma' means?
> 
It means num_active_vmas should be updated in a function.

> > 
> > Pass the parent of anon_vma to the anon_vma_alloc() and update the
> >  num_children inside it.
> > 
> >  Introduce anon_vma_attach() and anon_vma_detach() to update
> >  num_active_vmas with the anon_vma.
> > 
> I really dislike this naming - VMA detachment means something entirely
> different, you're not really 'attaching' or 'detaching' anything you're just
> changing a single stat of the ones you'd need to change were you truly doing so
> etc. etc.
> 
> It's misleading.
> 

I will use vma_attach_anon() and vma_detach_anon().

> > Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> > 
> Did you test this at all? It causes an immediate kernel panic for me when I run
> it in a VM:
> 
> In exit_mmap() -> free->pgtables() -> unlink_anon_vmas()
> 
> I haven't really dug into why but yeah, this is broken.
> 

I will fix it.

> > 
> > ---
> >  mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
> >  1 file changed, 32 insertions(+), 31 deletions(-)
> > 
> >  diff --git a/mm/rmap.c b/mm/rmap.c
> >  index 34333ae3bd80..2a28edfa5734 100644
> >  --- a/mm/rmap.c
> >  +++ b/mm/rmap.c
> >  @@ -86,15 +86,21 @@
> >  static struct kmem_cache *anon_vma_cachep;
> >  static struct kmem_cache *anon_vma_chain_cachep;
> > 
> >  -static inline struct anon_vma *anon_vma_alloc(void)
> >  +static inline struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
> > 
> I really dislike this, we only allocate with a parent in the fork case, this is
> just not a win imo + adds confusion.
>

I will add a wapper function.
 
> > 
> > {
> >  struct anon_vma *anon_vma;
> > 
> >  anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> >  - if (anon_vma) {
> >  - atomic_set(&anon_vma->refcount, 1);
> >  - anon_vma->num_children = 0;
> >  - anon_vma->num_active_vmas = 0;
> >  + if (!anon_vma)
> >  + return NULL;
> >  +
> >  + atomic_set(&anon_vma->refcount, 1);
> >  + anon_vma->num_children = 0;
> >  + anon_vma->num_active_vmas = 0;
> >  + if (parent) {
> >  + anon_vma->parent = parent;
> >  + anon_vma->root = parent->root;
> > 
> You are accessing parent fields without a lock. This is not good.
> 
> > 
> > + } else {
> >  anon_vma->parent = anon_vma;
> >  /*
> >  * Initialise the anon_vma root to point to itself. If called
> >  @@ -102,6 +108,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
> >  */
> >  anon_vma->root = anon_vma;
> >  }
> >  + anon_vma->parent->num_children++;
> > 
> This is even even even more not good, because you're accessing the parent
> without a lock, which is just completely broken.
> 
> I note below where you're doing this.
> 

I will add anon_vma_lock_write() and anon_vma_unlock_write().
> > 
> > return anon_vma;
> >  }
> >  @@ -146,6 +153,19 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
> >  kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
> >  }
> > 
> >  +static inline void anon_vma_attach(struct vm_area_struct *vma,
> >  + struct anon_vma *anon_vma)
> >  +{
> >  + vma->anon_vma = anon_vma;
> >  + vma->anon_vma->num_active_vmas++;
> >  +}
> > 
> Yeah I hate the naming, you should have lock asserts here, I don't love that
> we're pairing the vma and anon_vma like this, and again I just really question
> the value of this.
> 
anon_vma_chain_link() also didn't have lock assert.

> > 
> > +
> >  +static inline void anon_vma_detach(struct vm_area_struct *vma)
> >  +{
> >  + vma->anon_vma->num_active_vmas--;
> >  + vma->anon_vma = NULL;
> >  +}
> >  +
> >  static void anon_vma_chain_link(struct vm_area_struct *vma,
> >  struct anon_vma_chain *avc,
> >  struct anon_vma *anon_vma)
> >  @@ -198,10 +218,9 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> >  anon_vma = find_mergeable_anon_vma(vma);
> >  allocated = NULL;
> >  if (!anon_vma) {
> >  - anon_vma = anon_vma_alloc();
> >  + anon_vma = anon_vma_alloc(NULL);
> > 
> This 'arbitrary parameter which is NULL' is also pretty horrible.
> 

I will add a wapper function.

> > 
> > if (unlikely(!anon_vma))
> >  goto out_enomem_free_avc;
> >  - anon_vma->num_children++; /* self-parent link for new root */
> >  allocated = anon_vma;
> >  }
> > 
> >  @@ -209,9 +228,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> >  /* page_table_lock to protect against threads */
> >  spin_lock(&mm->page_table_lock);
> >  if (likely(!vma->anon_vma)) {
> >  - vma->anon_vma = anon_vma;
> >  + anon_vma_attach(vma, anon_vma);
> >  anon_vma_chain_link(vma, avc, anon_vma);
> >  - anon_vma->num_active_vmas++;
> >  allocated = NULL;
> >  avc = NULL;
> >  }
> >  @@ -306,10 +324,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> >  if (!dst->anon_vma && src->anon_vma &&
> >  anon_vma->num_children < 2 &&
> >  anon_vma->num_active_vmas == 0)
> >  - dst->anon_vma = anon_vma;
> >  + anon_vma_attach(dst, anon_vma);
> >  }
> >  - if (dst->anon_vma)
> >  - dst->anon_vma->num_active_vmas++;
> > 
> You're now losing the cases where we didn't reuse an anon_vma but dst->anon_vma
> != NULL? This is just broken isn't it?
> 

Yes, it is.
> > 
> > unlock_anon_vma_root(root);
> >  return 0;
> > 
> >  @@ -356,31 +372,22 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> >  return 0;
> > 
> >  /* Then add our own anon_vma. */
> >  - anon_vma = anon_vma_alloc();
> >  + anon_vma = anon_vma_alloc(pvma->anon_vma);
> >  if (!anon_vma)
> >  goto out_error;
> >  - anon_vma->num_active_vmas++;
> >  avc = anon_vma_chain_alloc(GFP_KERNEL);
> >  if (!avc)
> >  goto out_error_free_anon_vma;
> > 
> >  - /*
> >  - * The root anon_vma's rwsem is the lock actually used when we
> >  - * lock any of the anon_vmas in this anon_vma tree.
> >  - */
> > 
> Please please PLEASE do not delete comments like this.
> 

Okay.
> > 
> > - anon_vma->root = pvma->anon_vma->root;
> >  - anon_vma->parent = pvma->anon_vma;
> > 
> Yeah this is just not worth it in my opinion. You're putting code specific to
> forking in anon_vma_alloc(), which means you've made the code harder to
> understand.
> 

This won't make the code harder to understand,they are essentially the same.
One has itself as its parent, while another has a real parent.

> > 
> > /*
> >  * With refcounts, an anon_vma can stay around longer than the
> >  * process it belongs to. The root anon_vma needs to be pinned until
> >  * this anon_vma is freed, because the lock lives in the root.
> >  */
> >  get_anon_vma(anon_vma->root);
> >  - /* Mark this anon_vma as the one where our new (COWed) pages go. */
> > 
> Again, please do not remove comments like this.
> 

Okay.
> > 
> > - vma->anon_vma = anon_vma;
> >  + anon_vma_attach(vma, anon_vma);
> >  anon_vma_lock_write(anon_vma);
> >  anon_vma_chain_link(vma, avc, anon_vma);
> >  - anon_vma->parent->num_children++;
> > 
> So now we're updating things not under the lock?... this is extremely broken.
> 

Yes, it's a mistake.
> > 
> > anon_vma_unlock_write(anon_vma);
> > 
> >  return 0;
> >  @@ -419,15 +426,9 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> >  list_del(&avc->same_vma);
> >  anon_vma_chain_free(avc);
> >  }
> >  - if (vma->anon_vma) {
> >  - vma->anon_vma->num_active_vmas--;
> >  + if (vma->anon_vma)
> >  + anon_vma_detach(vma);
> > 
> >  - /*
> >  - * vma would still be needed after unlink, and anon_vma will be prepared
> >  - * when handle fault.
> >  - */
> > 
> You are removing key documentation of behaviour, please do not do that.
> 

Okay.
> > 
> > - vma->anon_vma = NULL;
> >  - }
> >  unlock_anon_vma_root(root);
> > 
> >  /*
> >  --
> >  2.25.1
> > 
> Overall this patch is really quite broken, but I don't think the general concept
> _as implemented here_ really gives much value.
> 
> I _like_ the idea of pairing adjustment of these kinds of anon_vma fields like
> num_children, num_active_vmas etc. with operations, but I think there's probably
> too many fiddly cases + a need for hellish lock stuff for there to be really
> anything too viable here.
> 

Sorry for the V1, I will send V2. 
> Cheers, Lorenzo
>
Re: [PATCH] mm/rmap: make num_children and num_active_vmas update in internally
Posted by Lorenzo Stoakes 1 day, 21 hours ago
I guess I wasn't clear enough. NAK to this whole concept. AFAICT there's
nothing of value here and you're fiddling with enormously sensitive
code.

Your first patch is incredibly broken and you've obviously not done any
testing whatsoever. This is not encouraging for code this sensitive, so If
you _insist_ on sending a respin anyway, send it as an RFC please (as this
patch should have been!)

Thanks.

On Sat, Sep 06, 2025 at 02:50:17PM +0000, Yajun Deng wrote:
> September 5, 2025 at 10:58 PM, "Lorenzo Stoakes" <lorenzo.stoakes@oracle.com mailto:lorenzo.stoakes@oracle.com?to=%22Lorenzo%20Stoakes%22%20%3Clorenzo.stoakes%40oracle.com%3E > wrote:
>
>
> >
> > On Fri, Sep 05, 2025 at 01:20:19PM +0000, Yajun Deng wrote:
> >
> > >
> > > If the anon_vma_alloc() is called, the num_children of the parent of
> > >  the anon_vma will be updated. But this operation occurs outside of
> > >  anon_vma_alloc().
> > >
> > I don't like that what is supposed to be an allocation function is now also
> > doing additional work.
> >
> > And there's literally only one place where this matters, since
> > __anon_vma_prepare() doesn't have a parent, it's ltierally only anon_vma_fork().
> >
> > And there are only 2 callers, so I don't see the purpose in doing this?
> >
>
> Yes, it is an allocation function. Some code doing like it.
> alloc_fair_sched_group() and alloc_rt_sched_group() are allocation functions.
> They are also pass the parent parameter to them.
>
> The purpose of this is to unify the code. They are essentially the same.
> One has itself as its parent, while another has a real parent.

To unify code that doesn't need unifying and moving logic specific to parts of
the code to a general place for no reason?

No, please just drop this idea altogether thanks.

>
> > >
> > > The num_active_vmas are also updated outside of anon_vma.
> > >
> > I don't know what 'outside of anon_vma' means?
> >
> It means num_active_vmas should be updated in a function.
>
> > >
> > > Pass the parent of anon_vma to the anon_vma_alloc() and update the
> > >  num_children inside it.
> > >
> > >  Introduce anon_vma_attach() and anon_vma_detach() to update
> > >  num_active_vmas with the anon_vma.
> > >
> > I really dislike this naming - VMA detachment means something entirely
> > different, you're not really 'attaching' or 'detaching' anything you're just
> > changing a single stat of the ones you'd need to change were you truly doing so
> > etc. etc.
> >
> > It's misleading.
> >
>
> I will use vma_attach_anon() and vma_detach_anon().

No. Sigh. The problem with patches like these is you end up
writing-the-code-via-review which is really not time efficient for
maintainers.

I'm really not sure there's any value here, but vma_add_anon_child(),
vma_remove_anon_child() would be _much_ better.

- No confusion vs. VMA attach/detach concept.
- Doesn't pretend to do everything needed to add a new child.

>
> > > Signed-off-by: Yajun Deng <yajun.deng@linux.dev>
> > >
> > Did you test this at all? It causes an immediate kernel panic for me when I run
> > it in a VM:
> >
> > In exit_mmap() -> free->pgtables() -> unlink_anon_vmas()
> >
> > I haven't really dug into why but yeah, this is broken.
> >
>
> I will fix it.

Please make sure to at least run your code locally before sending a patch,
thanks.

>
> > >
> > > ---
> > >  mm/rmap.c | 63 ++++++++++++++++++++++++++++---------------------------
> > >  1 file changed, 32 insertions(+), 31 deletions(-)
> > >
> > >  diff --git a/mm/rmap.c b/mm/rmap.c
> > >  index 34333ae3bd80..2a28edfa5734 100644
> > >  --- a/mm/rmap.c
> > >  +++ b/mm/rmap.c
> > >  @@ -86,15 +86,21 @@
> > >  static struct kmem_cache *anon_vma_cachep;
> > >  static struct kmem_cache *anon_vma_chain_cachep;
> > >
> > >  -static inline struct anon_vma *anon_vma_alloc(void)
> > >  +static inline struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
> > >
> > I really dislike this, we only allocate with a parent in the fork case, this is
> > just not a win imo + adds confusion.
> >
>
> I will add a wapper function.

No, just drop this idea altogether thanks.

>
> > >
> > > {
> > >  struct anon_vma *anon_vma;
> > >
> > >  anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
> > >  - if (anon_vma) {
> > >  - atomic_set(&anon_vma->refcount, 1);
> > >  - anon_vma->num_children = 0;
> > >  - anon_vma->num_active_vmas = 0;
> > >  + if (!anon_vma)
> > >  + return NULL;
> > >  +
> > >  + atomic_set(&anon_vma->refcount, 1);
> > >  + anon_vma->num_children = 0;
> > >  + anon_vma->num_active_vmas = 0;
> > >  + if (parent) {
> > >  + anon_vma->parent = parent;
> > >  + anon_vma->root = parent->root;
> > >
> > You are accessing parent fields without a lock. This is not good.

Ahem ^

> >
> > >
> > > + } else {
> > >  anon_vma->parent = anon_vma;
> > >  /*
> > >  * Initialise the anon_vma root to point to itself. If called
> > >  @@ -102,6 +108,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
> > >  */
> > >  anon_vma->root = anon_vma;
> > >  }
> > >  + anon_vma->parent->num_children++;
> > >
> > This is even even even more not good, because you're accessing the parent
> > without a lock, which is just completely broken.
> >
> > I note below where you're doing this.
> >
>
> I will add anon_vma_lock_write() and anon_vma_unlock_write().

No, please don't... things are locked by callers at the appropriate time so
you'll just deadlock like this.

Your issue is you're trying to 'unify' code in a way that isn't workable.

> > >
> > > return anon_vma;
> > >  }
> > >  @@ -146,6 +153,19 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
> > >  kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
> > >  }
> > >
> > >  +static inline void anon_vma_attach(struct vm_area_struct *vma,
> > >  + struct anon_vma *anon_vma)
> > >  +{
> > >  + vma->anon_vma = anon_vma;
> > >  + vma->anon_vma->num_active_vmas++;
> > >  +}
> > >
> > Yeah I hate the naming, you should have lock asserts here, I don't love that
> > we're pairing the vma and anon_vma like this, and again I just really question
> > the value of this.
> >
> anon_vma_chain_link() also didn't have lock assert.

I'm not sure what your point is?

Presumably you doing this patch is based on an assumption code needs
improvement, and then you question a suggestion based upon... an assumption
everything is currently perfect?

>
> > >
> > > +
> > >  +static inline void anon_vma_detach(struct vm_area_struct *vma)
> > >  +{
> > >  + vma->anon_vma->num_active_vmas--;
> > >  + vma->anon_vma = NULL;
> > >  +}
> > >  +
> > >  static void anon_vma_chain_link(struct vm_area_struct *vma,
> > >  struct anon_vma_chain *avc,
> > >  struct anon_vma *anon_vma)
> > >  @@ -198,10 +218,9 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> > >  anon_vma = find_mergeable_anon_vma(vma);
> > >  allocated = NULL;
> > >  if (!anon_vma) {
> > >  - anon_vma = anon_vma_alloc();
> > >  + anon_vma = anon_vma_alloc(NULL);
> > >
> > This 'arbitrary parameter which is NULL' is also pretty horrible.
> >
>
> I will add a wapper function.

No, just do not do this at all, thanks.

>
> > >
> > > if (unlikely(!anon_vma))
> > >  goto out_enomem_free_avc;
> > >  - anon_vma->num_children++; /* self-parent link for new root */
> > >  allocated = anon_vma;
> > >  }
> > >
> > >  @@ -209,9 +228,8 @@ int __anon_vma_prepare(struct vm_area_struct *vma)
> > >  /* page_table_lock to protect against threads */
> > >  spin_lock(&mm->page_table_lock);
> > >  if (likely(!vma->anon_vma)) {
> > >  - vma->anon_vma = anon_vma;
> > >  + anon_vma_attach(vma, anon_vma);
> > >  anon_vma_chain_link(vma, avc, anon_vma);
> > >  - anon_vma->num_active_vmas++;
> > >  allocated = NULL;
> > >  avc = NULL;
> > >  }
> > >  @@ -306,10 +324,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
> > >  if (!dst->anon_vma && src->anon_vma &&
> > >  anon_vma->num_children < 2 &&
> > >  anon_vma->num_active_vmas == 0)
> > >  - dst->anon_vma = anon_vma;
> > >  + anon_vma_attach(dst, anon_vma);
> > >  }
> > >  - if (dst->anon_vma)
> > >  - dst->anon_vma->num_active_vmas++;
> > >
> > You're now losing the cases where we didn't reuse an anon_vma but dst->anon_vma
> > != NULL? This is just broken isn't it?
> >
>
> Yes, it is.

...

> > >
> > > unlock_anon_vma_root(root);
> > >  return 0;
> > >
> > >  @@ -356,31 +372,22 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
> > >  return 0;
> > >
> > >  /* Then add our own anon_vma. */
> > >  - anon_vma = anon_vma_alloc();
> > >  + anon_vma = anon_vma_alloc(pvma->anon_vma);
> > >  if (!anon_vma)
> > >  goto out_error;
> > >  - anon_vma->num_active_vmas++;
> > >  avc = anon_vma_chain_alloc(GFP_KERNEL);
> > >  if (!avc)
> > >  goto out_error_free_anon_vma;
> > >
> > >  - /*
> > >  - * The root anon_vma's rwsem is the lock actually used when we
> > >  - * lock any of the anon_vmas in this anon_vma tree.
> > >  - */
> > >
> > Please please PLEASE do not delete comments like this.
> >
>
> Okay.
> > >
> > > - anon_vma->root = pvma->anon_vma->root;
> > >  - anon_vma->parent = pvma->anon_vma;
> > >
> > Yeah this is just not worth it in my opinion. You're putting code specific to
> > forking in anon_vma_alloc(), which means you've made the code harder to
> > understand.
> >
>
> This won't make the code harder to understand,they are essentially the same.
> One has itself as its parent, while another has a real parent.

They're NOT essentially the same. We have specific logic relating to the
parent on fork, and specific logic relating to anon_vma preparation.

It feels like you've looked at it and seen seemingly duplicated code and wanted
to find a way to write a function to share the code, but then done so without
any understanding of what is going on here.

Arbitrarily trying to squeeze logic into one function is _not_ a win.

Please just drop this parent stuff altogether, thanks.

>
> > >
> > > /*
> > >  * With refcounts, an anon_vma can stay around longer than the
> > >  * process it belongs to. The root anon_vma needs to be pinned until
> > >  * this anon_vma is freed, because the lock lives in the root.
> > >  */
> > >  get_anon_vma(anon_vma->root);
> > >  - /* Mark this anon_vma as the one where our new (COWed) pages go. */
> > >
> > Again, please do not remove comments like this.
> >
>
> Okay.
> > >
> > > - vma->anon_vma = anon_vma;
> > >  + anon_vma_attach(vma, anon_vma);
> > >  anon_vma_lock_write(anon_vma);
> > >  anon_vma_chain_link(vma, avc, anon_vma);
> > >  - anon_vma->parent->num_children++;
> > >
> > So now we're updating things not under the lock?... this is extremely broken.
> >
>
> Yes, it's a mistake.
> > >
> > > anon_vma_unlock_write(anon_vma);
> > >
> > >  return 0;
> > >  @@ -419,15 +426,9 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
> > >  list_del(&avc->same_vma);
> > >  anon_vma_chain_free(avc);
> > >  }
> > >  - if (vma->anon_vma) {
> > >  - vma->anon_vma->num_active_vmas--;
> > >  + if (vma->anon_vma)
> > >  + anon_vma_detach(vma);
> > >
> > >  - /*
> > >  - * vma would still be needed after unlink, and anon_vma will be prepared
> > >  - * when handle fault.
> > >  - */
> > >
> > You are removing key documentation of behaviour, please do not do that.
> >
>
> Okay.
> > >
> > > - vma->anon_vma = NULL;
> > >  - }
> > >  unlock_anon_vma_root(root);
> > >
> > >  /*
> > >  --
> > >  2.25.1
> > >
> > Overall this patch is really quite broken, but I don't think the general concept
> > _as implemented here_ really gives much value.
> >
> > I _like_ the idea of pairing adjustment of these kinds of anon_vma fields like
> > num_children, num_active_vmas etc. with operations, but I think there's probably
> > too many fiddly cases + a need for hellish lock stuff for there to be really
> > anything too viable here.
> >
>
> Sorry for the V1, I will send V2.

To reiterate -

I don't think there's any value in this patch at all and this code is _so_
sensitive that I'm not really hugely interested in seeing refactoring here
without very careful consideration.

I'm not convinced you're giving it that.

So I'd rather you didn't send a v2. But if you insist on doing that, send it as
an RFC please.

This patch _very clearly_ should have been an RFC as it is broken in multiple
fundamental ways.

Thanks, Lorenzo