[PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU

Posted by Suren Baghdasaryan 1 year ago
To enable SLAB_TYPESAFE_BY_RCU for the vma cache we need to ensure that
object reuse before the RCU grace period is over will be detected by
lock_vma_under_rcu().

The current checks are sufficient as long as the vma is detached before
it is freed. The only place this is not currently happening is in
exit_mmap(). Add the missing vma_mark_detached() in exit_mmap().

Another issue which might trick lock_vma_under_rcu() during vma reuse
is vm_area_dup(), which copies the entire content of the vma into a new
one, overwriting the new vma's vm_refcnt and temporarily making it
appear attached. This might trick a racing lock_vma_under_rcu() into
operating on a reused vma if it found the vma before it got reused. To
prevent this situation, ensure that vm_refcnt stays in the detached
state (0) while the contents are copied and advances to the attached
state only after the vma is added into the vma tree. Introduce
vm_area_init_from(), which preserves the new vma's vm_refcnt, and use
it in vm_area_dup(). Since all vmas are in the detached state with no
current readers when they are freed, lock_vma_under_rcu() will not be
able to take the vm_refcnt after the vma got detached, even if the vma
is later reused.

Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate
vm_area_struct reuse and will minimize the number of call_rcu() calls.
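
For reference, a simplified sketch of the reader-side pattern this
relies on, loosely modeled on lock_vma_under_rcu(). The helpers
try_get_vm_refcnt()/put_vm_refcnt() are illustrative stand-ins for the
series' vma_start_read()/vma_end_read(); the real function differs in
detail:

	rcu_read_lock();
	vma = mas_walk(&mas);			/* may observe a recycled object */
	if (vma && try_get_vm_refcnt(vma)) {	/* fails while detached (0) */
		/*
		 * SLAB_TYPESAFE_BY_RCU keeps the memory type-stable, but
		 * the object may have been freed and reused for a
		 * different range or mm, so recheck the lookup keys after
		 * taking the reference.
		 */
		if (vma->vm_mm != mm ||
		    address < vma->vm_start || address >= vma->vm_end) {
			put_vm_refcnt(vma);
			vma = NULL;
		}
	}
	rcu_read_unlock();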

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/mm.h               |  2 -
 include/linux/mm_types.h         | 13 ++++--
 include/linux/slab.h             |  6 ---
 kernel/fork.c                    | 73 ++++++++++++++++++++------------
 mm/mmap.c                        |  3 +-
 mm/vma.c                         | 11 ++---
 mm/vma.h                         |  2 +-
 tools/testing/vma/vma_internal.h |  7 +--
 8 files changed, 63 insertions(+), 54 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb29eb7360c5..ac78425e9838 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
-/* Use only if VMA has no other users */
-void __vm_area_free(struct vm_area_struct *vma);
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d902e6730654..d366ec6302e6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -574,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio)
 
 typedef unsigned long vm_flags_t;
 
+/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
 /*
  * A region containing a mapping of a non-memory backed file under NOMMU
  * conditions.  These are held in a global tree and are pinned by the VMAs that
@@ -677,6 +683,9 @@ struct vma_numab_state {
  *
  * Only explicitly marked struct members may be accessed by RCU readers before
  * getting a stable reference.
+ *
+ * WARNING: when adding new members, please update vm_area_init_from() to copy
+ * them during vm_area_struct content duplication.
  */
 struct vm_area_struct {
 	/* The first cache line has the info for VMA tree walking. */
@@ -687,9 +696,7 @@ struct vm_area_struct {
 			unsigned long vm_start;
 			unsigned long vm_end;
 		};
-#ifdef CONFIG_PER_VMA_LOCK
-		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
-#endif
+		freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
 	};
 
 	/*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 10a971c2bde3..681b685b6c4e 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -234,12 +234,6 @@ enum _slab_flag_bits {
 #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
 #endif
 
-/*
- * freeptr_t represents a SLUB freelist pointer, which might be encoded
- * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
- */
-typedef struct { unsigned long v; } freeptr_t;
-
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
diff --git a/kernel/fork.c b/kernel/fork.c
index 9d9275783cf8..151b40627c14 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 	return vma;
 }
 
+static void vm_area_init_from(const struct vm_area_struct *src,
+			      struct vm_area_struct *dest)
+{
+	dest->vm_mm = src->vm_mm;
+	dest->vm_ops = src->vm_ops;
+	dest->vm_start = src->vm_start;
+	dest->vm_end = src->vm_end;
+	dest->anon_vma = src->anon_vma;
+	dest->vm_pgoff = src->vm_pgoff;
+	dest->vm_file = src->vm_file;
+	dest->vm_private_data = src->vm_private_data;
+	vm_flags_init(dest, src->vm_flags);
+	memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+	       sizeof(dest->vm_page_prot));
+	/*
+	 * src->shared.rb may be modified concurrently when called from
+	 * dup_mmap(), but the clone will reinitialize it.
+	 */
+	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+	memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+	       sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+	dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+	memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+	       sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+	dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+	dest->vm_policy = src->vm_policy;
+#endif
+}
+
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
@@ -458,11 +494,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 
 	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
 	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-	/*
-	 * orig->shared.rb may be modified concurrently, but the clone
-	 * will be reinitialized.
-	 */
-	data_race(memcpy(new, orig, sizeof(*new)));
+	vm_area_init_from(orig, new);
 	vma_lock_init(new, true);
 	INIT_LIST_HEAD(&new->anon_vma_chain);
 	vma_numab_state_init(new);
@@ -471,7 +503,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	return new;
 }
 
-void __vm_area_free(struct vm_area_struct *vma)
+void vm_area_free(struct vm_area_struct *vma)
 {
 	/* The vma should be detached while being destroyed. */
 	vma_assert_detached(vma);
@@ -480,25 +512,6 @@ void __vm_area_free(struct vm_area_struct *vma)
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
-#ifdef CONFIG_PER_VMA_LOCK
-static void vm_area_free_rcu_cb(struct rcu_head *head)
-{
-	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
-						  vm_rcu);
-
-	__vm_area_free(vma);
-}
-#endif
-
-void vm_area_free(struct vm_area_struct *vma)
-{
-#ifdef CONFIG_PER_VMA_LOCK
-	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
-#else
-	__vm_area_free(vma);
-#endif
-}
-
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -3144,6 +3157,11 @@ void __init mm_cache_init(void)
 
 void __init proc_caches_init(void)
 {
+	struct kmem_cache_args args = {
+		.use_freeptr_offset = true,
+		.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+	};
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3160,8 +3178,9 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
-	vm_area_cachep = KMEM_CACHE(vm_area_struct,
-			SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
+	vm_area_cachep = kmem_cache_create("vm_area_struct",
+			sizeof(struct vm_area_struct), &args,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
 			SLAB_ACCOUNT);
 	mmap_init();
 	nsproxy_cache_init();
diff --git a/mm/mmap.c b/mm/mmap.c
index cda01071c7b1..7aa36216ecc0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1305,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm)
 	do {
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
-		remove_vma(vma, /* unreachable = */ true);
+		vma_mark_detached(vma);
+		remove_vma(vma);
 		count++;
 		cond_resched();
 		vma = vma_next(&vmi);
diff --git a/mm/vma.c b/mm/vma.c
index 93ff42ac2002..0a5158d611e3 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -406,19 +406,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
 /*
  * Close a vm structure and free it.
  */
-void remove_vma(struct vm_area_struct *vma, bool unreachable)
+void remove_vma(struct vm_area_struct *vma)
 {
 	might_sleep();
 	vma_close(vma);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	if (unreachable) {
-		vma_mark_detached(vma);
-		__vm_area_free(vma);
-	} else {
-		vm_area_free(vma);
-	}
+	vm_area_free(vma);
 }
 
 /*
@@ -1201,7 +1196,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
 	/* Remove and clean up vmas */
 	mas_set(mas_detach, 0);
 	mas_for_each(mas_detach, vma, ULONG_MAX)
-		remove_vma(vma, /* unreachable = */ false);
+		remove_vma(vma);
 
 	vm_unacct_memory(vms->nr_accounted);
 	validate_mm(mm);
diff --git a/mm/vma.h b/mm/vma.h
index 63dd38d5230c..f51005b95b39 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -170,7 +170,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 		  unsigned long start, size_t len, struct list_head *uf,
 		  bool unlock);
 
-void remove_vma(struct vm_area_struct *vma, bool unreachable);
+void remove_vma(struct vm_area_struct *vma);
 
 void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct vm_area_struct *next);
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 2ce032943861..49a85ce0d45a 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -697,14 +697,9 @@ static inline void mpol_put(struct mempolicy *)
 {
 }
 
-static inline void __vm_area_free(struct vm_area_struct *vma)
-{
-	free(vma);
-}
-
 static inline void vm_area_free(struct vm_area_struct *vma)
 {
-	__vm_area_free(vma);
+	free(vma);
 }
 
 static inline void lru_add_drain(void)
-- 
2.47.1.613.gc27f4b7a9f-goog
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Wei Yang 1 year ago
On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:

>diff --git a/kernel/fork.c b/kernel/fork.c
>index 9d9275783cf8..151b40627c14 100644
>--- a/kernel/fork.c
>+++ b/kernel/fork.c
>@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> 	return vma;
> }
> 
>+static void vm_area_init_from(const struct vm_area_struct *src,
>+			      struct vm_area_struct *dest)
>+{
>+	dest->vm_mm = src->vm_mm;
>+	dest->vm_ops = src->vm_ops;
>+	dest->vm_start = src->vm_start;
>+	dest->vm_end = src->vm_end;
>+	dest->anon_vma = src->anon_vma;
>+	dest->vm_pgoff = src->vm_pgoff;
>+	dest->vm_file = src->vm_file;
>+	dest->vm_private_data = src->vm_private_data;
>+	vm_flags_init(dest, src->vm_flags);
>+	memcpy(&dest->vm_page_prot, &src->vm_page_prot,
>+	       sizeof(dest->vm_page_prot));
>+	/*
>+	 * src->shared.rb may be modified concurrently when called from
>+	 * dup_mmap(), but the clone will reinitialize it.
>+	 */
>+	data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
>+	memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
>+	       sizeof(dest->vm_userfaultfd_ctx));
>+#ifdef CONFIG_ANON_VMA_NAME
>+	dest->anon_name = src->anon_name;
>+#endif
>+#ifdef CONFIG_SWAP
>+	memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
>+	       sizeof(dest->swap_readahead_info));
>+#endif
>+#ifndef CONFIG_MMU
>+	dest->vm_region = src->vm_region;
>+#endif
>+#ifdef CONFIG_NUMA
>+	dest->vm_policy = src->vm_policy;
>+#endif
>+}

Would this be difficult to maintain? We should make sure not to miss or
overwrite anything.

-- 
Wei Yang
Help you, Help me
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 1 year ago
On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
>
> On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
>
> >diff --git a/kernel/fork.c b/kernel/fork.c
> >index 9d9275783cf8..151b40627c14 100644
> >--- a/kernel/fork.c
> >+++ b/kernel/fork.c
> >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> >       return vma;
> > }
> >
> >+static void vm_area_init_from(const struct vm_area_struct *src,
> >+                            struct vm_area_struct *dest)
> >+{
> >+      dest->vm_mm = src->vm_mm;
> >+      dest->vm_ops = src->vm_ops;
> >+      dest->vm_start = src->vm_start;
> >+      dest->vm_end = src->vm_end;
> >+      dest->anon_vma = src->anon_vma;
> >+      dest->vm_pgoff = src->vm_pgoff;
> >+      dest->vm_file = src->vm_file;
> >+      dest->vm_private_data = src->vm_private_data;
> >+      vm_flags_init(dest, src->vm_flags);
> >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> >+             sizeof(dest->vm_page_prot));
> >+      /*
> >+       * src->shared.rb may be modified concurrently when called from
> >+       * dup_mmap(), but the clone will reinitialize it.
> >+       */
> >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> >+             sizeof(dest->vm_userfaultfd_ctx));
> >+#ifdef CONFIG_ANON_VMA_NAME
> >+      dest->anon_name = src->anon_name;
> >+#endif
> >+#ifdef CONFIG_SWAP
> >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> >+             sizeof(dest->swap_readahead_info));
> >+#endif
> >+#ifndef CONFIG_MMU
> >+      dest->vm_region = src->vm_region;
> >+#endif
> >+#ifdef CONFIG_NUMA
> >+      dest->vm_policy = src->vm_policy;
> >+#endif
> >+}
>
> Would this be difficult to maintain? We should make sure not to miss or
> overwrite anything.

Yeah, it is less maintainable than a simple memcpy() but I did not
find a better alternative. I added a warning above the struct
vm_area_struct definition to update this function every time we change
that structure. Not sure if there is anything else I can do to help
with this.

>
> --
> Wei Yang
> Help you, Help me
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Wei Yang 1 year ago
On Tue, Jan 14, 2025 at 07:15:05PM -0800, Suren Baghdasaryan wrote:
>On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
>>
>> On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
>>
>> >diff --git a/kernel/fork.c b/kernel/fork.c
>> >index 9d9275783cf8..151b40627c14 100644
>> >--- a/kernel/fork.c
>> >+++ b/kernel/fork.c
>> >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>> >       return vma;
>> > }
>> >
>> >+static void vm_area_init_from(const struct vm_area_struct *src,
>> >+                            struct vm_area_struct *dest)
>> >+{
>> >+      dest->vm_mm = src->vm_mm;
>> >+      dest->vm_ops = src->vm_ops;
>> >+      dest->vm_start = src->vm_start;
>> >+      dest->vm_end = src->vm_end;
>> >+      dest->anon_vma = src->anon_vma;
>> >+      dest->vm_pgoff = src->vm_pgoff;
>> >+      dest->vm_file = src->vm_file;
>> >+      dest->vm_private_data = src->vm_private_data;
>> >+      vm_flags_init(dest, src->vm_flags);
>> >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
>> >+             sizeof(dest->vm_page_prot));
>> >+      /*
>> >+       * src->shared.rb may be modified concurrently when called from
>> >+       * dup_mmap(), but the clone will reinitialize it.
>> >+       */
>> >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
>> >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
>> >+             sizeof(dest->vm_userfaultfd_ctx));
>> >+#ifdef CONFIG_ANON_VMA_NAME
>> >+      dest->anon_name = src->anon_name;
>> >+#endif
>> >+#ifdef CONFIG_SWAP
>> >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
>> >+             sizeof(dest->swap_readahead_info));
>> >+#endif
>> >+#ifndef CONFIG_MMU
>> >+      dest->vm_region = src->vm_region;
>> >+#endif
>> >+#ifdef CONFIG_NUMA
>> >+      dest->vm_policy = src->vm_policy;
>> >+#endif
>> >+}
>>
>> Would this be difficult to maintain? We should make sure not to miss or
>> overwrite anything.
>
>Yeah, it is less maintainable than a simple memcpy() but I did not
>find a better alternative. I added a warning above the struct
>vm_area_struct definition to update this function every time we change
>that structure. Not sure if there is anything else I can do to help
>with this.
>

For !PER_VMA_LOCK, maybe we can use memcpy() as usual.

For PER_VMA_LOCK, I just came up with the same idea as you :-)

>>
>> --
>> Wei Yang
>> Help you, Help me

-- 
Wei Yang
Help you, Help me
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 1 year ago
On Wed, Jan 15, 2025 at 4:17 AM Wei Yang <richard.weiyang@gmail.com> wrote:
>
> On Tue, Jan 14, 2025 at 07:15:05PM -0800, Suren Baghdasaryan wrote:
> >On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> >>
> >> On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> >>
> >> >diff --git a/kernel/fork.c b/kernel/fork.c
> >> >index 9d9275783cf8..151b40627c14 100644
> >> >--- a/kernel/fork.c
> >> >+++ b/kernel/fork.c
> >> >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> >> >       return vma;
> >> > }
> >> >
> >> >+static void vm_area_init_from(const struct vm_area_struct *src,
> >> >+                            struct vm_area_struct *dest)
> >> >+{
> >> >+      dest->vm_mm = src->vm_mm;
> >> >+      dest->vm_ops = src->vm_ops;
> >> >+      dest->vm_start = src->vm_start;
> >> >+      dest->vm_end = src->vm_end;
> >> >+      dest->anon_vma = src->anon_vma;
> >> >+      dest->vm_pgoff = src->vm_pgoff;
> >> >+      dest->vm_file = src->vm_file;
> >> >+      dest->vm_private_data = src->vm_private_data;
> >> >+      vm_flags_init(dest, src->vm_flags);
> >> >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> >> >+             sizeof(dest->vm_page_prot));
> >> >+      /*
> >> >+       * src->shared.rb may be modified concurrently when called from
> >> >+       * dup_mmap(), but the clone will reinitialize it.
> >> >+       */
> >> >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> >> >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> >> >+             sizeof(dest->vm_userfaultfd_ctx));
> >> >+#ifdef CONFIG_ANON_VMA_NAME
> >> >+      dest->anon_name = src->anon_name;
> >> >+#endif
> >> >+#ifdef CONFIG_SWAP
> >> >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> >> >+             sizeof(dest->swap_readahead_info));
> >> >+#endif
> >> >+#ifndef CONFIG_MMU
> >> >+      dest->vm_region = src->vm_region;
> >> >+#endif
> >> >+#ifdef CONFIG_NUMA
> >> >+      dest->vm_policy = src->vm_policy;
> >> >+#endif
> >> >+}
> >>
> >> Would this be difficult to maintain? We should make sure not to miss or
> >> overwrite anything.
> >
> >Yeah, it is less maintainable than a simple memcpy() but I did not
> >find a better alternative. I added a warning above the struct
> >vm_area_struct definition to update this function every time we change
> >that structure. Not sure if there is anything else I can do to help
> >with this.
> >
>
> For !PER_VMA_LOCK, maybe we can use memcpy() as usual.
>
> For PER_VMA_LOCK, I just came up with the same idea as you :-)

Missed this comment. Yeah, in one of the previous versions I had
different !PER_VMA_LOCK and PER_VMA_LOCK versions of vma_copy(), but
David questioned whether it was worth having two versions. From a
performance POV there is no reason for that, and it unnecessarily
complicates the code. So I dropped that in favor of having one
version.

>
> >>
> >> --
> >> Wei Yang
> >> Help you, Help me
>
> --
> Wei Yang
> Help you, Help me
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Vlastimil Babka 1 year ago
On 1/15/25 04:15, Suren Baghdasaryan wrote:
> On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
>>
>> On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
>>
>> >diff --git a/kernel/fork.c b/kernel/fork.c
>> >index 9d9275783cf8..151b40627c14 100644
>> >--- a/kernel/fork.c
>> >+++ b/kernel/fork.c
>> >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
>> >       return vma;
>> > }
>> >
>> >+static void vm_area_init_from(const struct vm_area_struct *src,
>> >+                            struct vm_area_struct *dest)
>> >+{
>> >+      dest->vm_mm = src->vm_mm;
>> >+      dest->vm_ops = src->vm_ops;
>> >+      dest->vm_start = src->vm_start;
>> >+      dest->vm_end = src->vm_end;
>> >+      dest->anon_vma = src->anon_vma;
>> >+      dest->vm_pgoff = src->vm_pgoff;
>> >+      dest->vm_file = src->vm_file;
>> >+      dest->vm_private_data = src->vm_private_data;
>> >+      vm_flags_init(dest, src->vm_flags);
>> >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
>> >+             sizeof(dest->vm_page_prot));
>> >+      /*
>> >+       * src->shared.rb may be modified concurrently when called from
>> >+       * dup_mmap(), but the clone will reinitialize it.
>> >+       */
>> >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
>> >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
>> >+             sizeof(dest->vm_userfaultfd_ctx));
>> >+#ifdef CONFIG_ANON_VMA_NAME
>> >+      dest->anon_name = src->anon_name;
>> >+#endif
>> >+#ifdef CONFIG_SWAP
>> >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
>> >+             sizeof(dest->swap_readahead_info));
>> >+#endif
>> >+#ifndef CONFIG_MMU
>> >+      dest->vm_region = src->vm_region;
>> >+#endif
>> >+#ifdef CONFIG_NUMA
>> >+      dest->vm_policy = src->vm_policy;
>> >+#endif
>> >+}
>>
>> Would this be difficult to maintain? We should make sure not to miss or
>> overwrite anything.
> 
> Yeah, it is less maintainable than a simple memcpy() but I did not
> find a better alternative.

Willy knows one but refuses to share it :(

> I added a warning above the struct
> vm_area_struct definition to update this function every time we change
> that structure. Not sure if there is anything else I can do to help
> with this.
> 
>>
>> --
>> Wei Yang
>> Help you, Help me

Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 1 year ago
On Tue, Jan 14, 2025 at 11:58 PM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 1/15/25 04:15, Suren Baghdasaryan wrote:
> > On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> >>
> >> On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> >>
> >> >diff --git a/kernel/fork.c b/kernel/fork.c
> >> >index 9d9275783cf8..151b40627c14 100644
> >> >--- a/kernel/fork.c
> >> >+++ b/kernel/fork.c
> >> >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> >> >       return vma;
> >> > }
> >> >
> >> >+static void vm_area_init_from(const struct vm_area_struct *src,
> >> >+                            struct vm_area_struct *dest)
> >> >+{
> >> >+      dest->vm_mm = src->vm_mm;
> >> >+      dest->vm_ops = src->vm_ops;
> >> >+      dest->vm_start = src->vm_start;
> >> >+      dest->vm_end = src->vm_end;
> >> >+      dest->anon_vma = src->anon_vma;
> >> >+      dest->vm_pgoff = src->vm_pgoff;
> >> >+      dest->vm_file = src->vm_file;
> >> >+      dest->vm_private_data = src->vm_private_data;
> >> >+      vm_flags_init(dest, src->vm_flags);
> >> >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> >> >+             sizeof(dest->vm_page_prot));
> >> >+      /*
> >> >+       * src->shared.rb may be modified concurrently when called from
> >> >+       * dup_mmap(), but the clone will reinitialize it.
> >> >+       */
> >> >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> >> >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> >> >+             sizeof(dest->vm_userfaultfd_ctx));
> >> >+#ifdef CONFIG_ANON_VMA_NAME
> >> >+      dest->anon_name = src->anon_name;
> >> >+#endif
> >> >+#ifdef CONFIG_SWAP
> >> >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> >> >+             sizeof(dest->swap_readahead_info));
> >> >+#endif
> >> >+#ifndef CONFIG_MMU
> >> >+      dest->vm_region = src->vm_region;
> >> >+#endif
> >> >+#ifdef CONFIG_NUMA
> >> >+      dest->vm_policy = src->vm_policy;
> >> >+#endif
> >> >+}
> >>
> >> Would this be difficult to maintain? We should make sure not to miss or
> >> overwrite anything.
> >
> > Yeah, it is less maintainable than a simple memcpy() but I did not
> > find a better alternative.
>
> Willy knows one but refuses to share it :(

Ah, that reminds me why I dropped this approach :) But to be honest,
back then we also had vma_clear() and that added to the ugliness. Now
I could simply do this without all those macros:

static inline void vma_copy(struct vm_area_struct *new,
			    struct vm_area_struct *orig)
{
	/* Copy the vma while preserving vma->vm_lock */
	data_race(memcpy(new, orig, offsetof(struct vm_area_struct, vm_lock)));
	data_race(memcpy((void *)new + offsetofend(struct vm_area_struct, vm_lock),
			 (void *)orig + offsetofend(struct vm_area_struct, vm_lock),
			 sizeof(struct vm_area_struct) -
			 offsetofend(struct vm_area_struct, vm_lock)));
}

Would that be better than the current approach?

>
> > I added a warning above the struct
> > vm_area_struct definition to update this function every time we change
> > that structure. Not sure if there is anything else I can do to help
> > with this.
> >
> >>
> >> --
> >> Wei Yang
> >> Help you, Help me
>
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 11 months, 4 weeks ago
On Wed, Jan 15, 2025 at 7:10 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jan 14, 2025 at 11:58 PM Vlastimil Babka <vbabka@suse.cz> wrote:
> >
> > On 1/15/25 04:15, Suren Baghdasaryan wrote:
> > > On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> > >>
> > >> On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> > >>
> > >> >diff --git a/kernel/fork.c b/kernel/fork.c
> > >> >index 9d9275783cf8..151b40627c14 100644
> > >> >--- a/kernel/fork.c
> > >> >+++ b/kernel/fork.c
> > >> >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > >> >       return vma;
> > >> > }
> > >> >
> > >> >+static void vm_area_init_from(const struct vm_area_struct *src,
> > >> >+                            struct vm_area_struct *dest)
> > >> >+{
> > >> >+      dest->vm_mm = src->vm_mm;
> > >> >+      dest->vm_ops = src->vm_ops;
> > >> >+      dest->vm_start = src->vm_start;
> > >> >+      dest->vm_end = src->vm_end;
> > >> >+      dest->anon_vma = src->anon_vma;
> > >> >+      dest->vm_pgoff = src->vm_pgoff;
> > >> >+      dest->vm_file = src->vm_file;
> > >> >+      dest->vm_private_data = src->vm_private_data;
> > >> >+      vm_flags_init(dest, src->vm_flags);
> > >> >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > >> >+             sizeof(dest->vm_page_prot));
> > >> >+      /*
> > >> >+       * src->shared.rb may be modified concurrently when called from
> > >> >+       * dup_mmap(), but the clone will reinitialize it.
> > >> >+       */
> > >> >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > >> >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > >> >+             sizeof(dest->vm_userfaultfd_ctx));
> > >> >+#ifdef CONFIG_ANON_VMA_NAME
> > >> >+      dest->anon_name = src->anon_name;
> > >> >+#endif
> > >> >+#ifdef CONFIG_SWAP
> > >> >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > >> >+             sizeof(dest->swap_readahead_info));
> > >> >+#endif
> > >> >+#ifndef CONFIG_MMU
> > >> >+      dest->vm_region = src->vm_region;
> > >> >+#endif
> > >> >+#ifdef CONFIG_NUMA
> > >> >+      dest->vm_policy = src->vm_policy;
> > >> >+#endif
> > >> >+}
> > >>
> > >> Would this be difficult to maintain? We should make sure not to miss or
> > >> overwrite anything.
> > >
> > > Yeah, it is less maintainable than a simple memcpy() but I did not
> > > find a better alternative.
> >
> > Willy knows one but refuses to share it :(
>
> Ah, that reminds me why I dropped this approach :) But to be honest,
> back then we also had vma_clear() and that added to the ugliness. Now
> I could simply do this without all those macros:
>
> static inline void vma_copy(struct vm_area_struct *new,
> 			    struct vm_area_struct *orig)
> {
> 	/* Copy the vma while preserving vma->vm_lock */
> 	data_race(memcpy(new, orig, offsetof(struct vm_area_struct, vm_lock)));
> 	data_race(memcpy((void *)new + offsetofend(struct vm_area_struct, vm_lock),
> 			 (void *)orig + offsetofend(struct vm_area_struct, vm_lock),
> 			 sizeof(struct vm_area_struct) -
> 			 offsetofend(struct vm_area_struct, vm_lock)));
> }
>
> Would that be better than the current approach?

I discussed proposed alternatives with Willy and he prefers the
current field-by-field copy approach. I also tried using
kmsan_check_memory() to check for uninitialized memory in the
vm_area_struct but unfortunately KMSAN stumbles on the holes in this
structure and there are 4 of them (I attached pahole output at the end
of this email). I tried unpoisoning holes but that gets very ugly very
fast. So, I posted v10
(https://lore.kernel.org/all/20250213224655.1680278-18-surenb@google.com/)
without changing this part.
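
For reference, the attempted check looked roughly like the following
(kmsan_poison_memory()/kmsan_check_memory() are the real KMSAN hooks,
but the exact placement in vm_area_dup() is assumed for illustration):

	/* Mark the new vma fully uninitialized... */
	kmsan_poison_memory(new, sizeof(*new), GFP_KERNEL);
	vm_area_init_from(orig, new);
	/*
	 * ...then require that every byte was written. The holes in
	 * vm_area_struct trip this check, as described above.
	 */
	kmsan_check_memory(new, sizeof(*new));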

struct vm_area_struct {
        union {
                struct {
                        unsigned long vm_start;          /*     0     8 */
                        unsigned long vm_end;            /*     8     8 */
                };                                       /*     0    16 */
                freeptr_t          vm_freeptr;           /*     0     8 */
        };                                               /*     0    16 */
        struct mm_struct *         vm_mm;                /*    16     8 */
        pgprot_t                   vm_page_prot;         /*    24     8 */
        union {
                const vm_flags_t   vm_flags;             /*    32     8 */
                vm_flags_t         __vm_flags;           /*    32     8 */
        };                                               /*    32     8 */
        unsigned int               vm_lock_seq;          /*    40     4 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head           anon_vma_chain;       /*    48    16 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct anon_vma *          anon_vma;             /*    64     8 */
        const struct vm_operations_struct  * vm_ops;     /*    72     8 */
        unsigned long              vm_pgoff;             /*    80     8 */
        struct file *              vm_file;              /*    88     8 */
        void *                     vm_private_data;      /*    96     8 */
        atomic_long_t              swap_readahead_info;  /*   104     8 */
        struct mempolicy *         vm_policy;            /*   112     8 */

        /* XXX 8 bytes hole, try to pack */

        /* --- cacheline 2 boundary (128 bytes) --- */
        refcount_t                 vm_refcnt __attribute__((__aligned__(64))); /*   128     4 */

        /* XXX 4 bytes hole, try to pack */

        struct {
                struct rb_node     rb __attribute__((__aligned__(8))); /*   136    24 */
                unsigned long      rb_subtree_last;      /*   160     8 */
        } shared;                                        /*   136    32 */
        struct vm_userfaultfd_ctx  vm_userfaultfd_ctx;   /*   168     0 */

        /* size: 192, cachelines: 3, members: 16 */
        /* sum members: 152, holes: 3, sum holes: 16 */
        /* padding: 24 */
        /* forced alignments: 1, forced holes: 1, sum forced holes: 8 */
};

>
> >
> > > I added a warning above the struct
> > > vm_area_struct definition to update this function every time we change
> > > that structure. Not sure if there is anything else I can do to help
> > > with this.
> > >
> > >>
> > >> --
> > >> Wei Yang
> > >> Help you, Help me
> >
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Mateusz Guzik 1 year ago
On Wed, Jan 15, 2025 at 4:15 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> >
> > On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> >
> > >diff --git a/kernel/fork.c b/kernel/fork.c
> > >index 9d9275783cf8..151b40627c14 100644
> > >--- a/kernel/fork.c
> > >+++ b/kernel/fork.c
> > >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > >       return vma;
> > > }
> > >
> > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > >+                            struct vm_area_struct *dest)
> > >+{
[snip]
> > Would this be difficult to maintain? We should make sure not to miss or
> > overwrite anything.
>
> Yeah, it is less maintainable than a simple memcpy() but I did not
> find a better alternative. I added a warning above the struct
> vm_area_struct definition to update this function every time we change
> that structure. Not sure if there is anything else I can do to help
> with this.
>

Bare minimum this could have a BUILD_BUG_ON below the func for the
known-covered size. But it would have to be conditional on arch and
some macros, somewhat nasty.
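
For example (a hypothetical sketch, not from the patch; the 192 matches
the x86-64 layout from the pahole output later in this thread and would
differ per arch/config, which is exactly the nastiness above):

#ifdef CONFIG_X86_64
static_assert(sizeof(struct vm_area_struct) == 192,
	      "vm_area_struct changed; update vm_area_init_from()");
#endif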

KASAN or KMSAN (I don't remember which) can be used to find missing
copies. To that end the target struct could be marked as fully
uninitialized before copy and have a full read performed from it
afterwards -- guaranteed to trip over any field not explicitly covered
(including padding though). I don't know what magic macros can be used
to do this in Linux; I am saying the support to get this result is
there. I understand most people don't use this, but this still should
be enough to trip over buggy patches in -next.

Finally, the struct could have macros delimiting copy/non-copy
sections (with macros expanding to field names), for illustrative
purposes:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 332cee285662..25063a3970c8 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -677,6 +677,7 @@ struct vma_numab_state {
  * getting a stable reference.
  */
 struct vm_area_struct {
+#define vma_start_copy0 vm_rcu
        /* The first cache line has the info for VMA tree walking. */

        union {
@@ -731,6 +732,7 @@ struct vm_area_struct {
        /* Unstable RCU readers are allowed to read this. */
        struct vma_lock *vm_lock;
 #endif
+#define vma_end_copy1 vm_lock

        /*
         * For areas with an address space and backing store,

then you would do everything with a series of calls

however, the __randomize_layout annotation whacks that idea (maybe it
can be retired?)

-- 
Mateusz Guzik <mjguzik gmail.com>
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 1 year ago
On Tue, Jan 14, 2025 at 8:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 4:15 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> > >
> > > On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> > >
> > > >diff --git a/kernel/fork.c b/kernel/fork.c
> > > >index 9d9275783cf8..151b40627c14 100644
> > > >--- a/kernel/fork.c
> > > >+++ b/kernel/fork.c
> > > >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > >       return vma;
> > > > }
> > > >
> > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > >+                            struct vm_area_struct *dest)
> > > >+{
> [snip]
> > > Would this be difficult to maintain? We should make sure not to miss or
> > > overwrite anything.
> >
> > Yeah, it is less maintainable than a simple memcpy() but I did not
> > find a better alternative. I added a warning above the struct
> > vm_area_struct definition to update this function every time we change
> > that structure. Not sure if there is anything else I can do to help
> > with this.
> >
>
> Bare minimum this could have a BUILD_BUG_ON below the func for the
> known-covered size. But it would have to be conditional on arch and
> some macros, somewhat nasty.
>
> KASAN or KMSAN (I don't remember which) can be used to find missing
> copies. To that end the target struct could be marked as fully
> uninitialized before copy and have a full read performed from it
> afterwards -- guaranteed to trip over any field not explicitly covered
> (including padding though). I don't know what magic macros can be used
> to do this in Linux; I am saying the support to get this
> result is there. I understand most people don't use this, but this
> still should be enough to trip over buggy patches in -next.

If my previous suggestion does not fly I'll start digging into KASAN
to see how we can use it. Thanks for the tip.

>
> Finally, the struct could have macros delimiting copy/non-copy
> sections (with macros expanding to field names), for illustrative
> purposes:
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 332cee285662..25063a3970c8 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -677,6 +677,7 @@ struct vma_numab_state {
>   * getting a stable reference.
>   */
>  struct vm_area_struct {
> +#define vma_start_copy0 vm_rcu
>         /* The first cache line has the info for VMA tree walking. */
>
>         union {
> @@ -731,6 +732,7 @@ struct vm_area_struct {
>         /* Unstable RCU readers are allowed to read this. */
>         struct vma_lock *vm_lock;
>  #endif
> +#define vma_end_copy1 vm_lock
>
>         /*
>          * For areas with an address space and backing store,
>
> then you would do everything with a series of calls

I'm not sure... My proposed approach with offsetof() I think is a bit
cleaner than adding macros to denote copy sections. WDYT?

>
> however, the __randomize_layout annotation whacks that idea (maybe it
> can be retired?)
>
> --
> Mateusz Guzik <mjguzik gmail.com>
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Mateusz Guzik 1 year ago
On Wed, Jan 15, 2025 at 6:47 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Jan 14, 2025 at 8:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> >
> > On Wed, Jan 15, 2025 at 4:15 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> > > >
> > > > On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> > > >
> > > > >diff --git a/kernel/fork.c b/kernel/fork.c
> > > > >index 9d9275783cf8..151b40627c14 100644
> > > > >--- a/kernel/fork.c
> > > > >+++ b/kernel/fork.c
> > > > >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > >       return vma;
> > > > > }
> > > > >
> > > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > > >+                            struct vm_area_struct *dest)
> > > > >+{
> > [snip]
> > > > Would this be difficult to maintain? We should make sure not to miss or
> > > > overwrite anything.
> > >
> > > Yeah, it is less maintainable than a simple memcpy() but I did not
> > > find a better alternative. I added a warning above the struct
> > > vm_area_struct definition to update this function every time we change
> > > that structure. Not sure if there is anything else I can do to help
> > > with this.
> > >
> >
> > Bare minimum this could have a BUILD_BUG_ON below the func for the
> > known-covered size. But it would have to be conditional on arch and
> > some macros, somewhat nasty.
> >
> > KASAN or KMSAN (I don't remember which) can be used to find missing
> > copies. To that end the target struct could be marked as fully
> > uninitialized before copy and have a full read performed from it
> > afterwards -- guaranteed to trip over any field which any field not
> > explicitly covered (including padding though). I don't know what magic
> > macros can be used to do in Linux, I am saying the support to get this
> > result is there. I understand most people don't use this, but this
> > still should be enough to trip over buggy patches in -next.
>
> If my previous suggestion does not fly I'll start digging into KASAN
> to see how we can use it. Thanks for the tip.
>
> >
> > Finally, the struct could have macros delimiting copy/non-copy
> > sections (with macros expanding to field names), for illustrative
> > purposes:
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 332cee285662..25063a3970c8 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -677,6 +677,7 @@ struct vma_numab_state {
> >   * getting a stable reference.
> >   */
> >  struct vm_area_struct {
> > +#define vma_start_copy0 vm_rcu
> >         /* The first cache line has the info for VMA tree walking. */
> >
> >         union {
> > @@ -731,6 +732,7 @@ struct vm_area_struct {
> >         /* Unstable RCU readers are allowed to read this. */
> >         struct vma_lock *vm_lock;
> >  #endif
> > +#define vma_end_copy1 vm_lock
> >
> >         /*
> >          * For areas with an address space and backing store,
> >
> > then you would do everything with a series of calls
>
> I'm not sure... My proposed approach with offsetof() I think is a bit
> cleaner than adding macros to denote copy sections. WDYT?
>

another non-copy field may show up down the road and then the person
adding it is going to be a sad panda. won't happen if the "infra" is
there.

but I concede this is not a big deal and i'm not going to bikeshed about it.

-- 
Mateusz Guzik <mjguzik gmail.com>
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 1 year ago
On Tue, Jan 14, 2025 at 9:52 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Wed, Jan 15, 2025 at 6:47 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Jan 14, 2025 at 8:00 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > >
> > > On Wed, Jan 15, 2025 at 4:15 AM Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> > > > >
> > > > > On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> > > > >
> > > > > >diff --git a/kernel/fork.c b/kernel/fork.c
> > > > > >index 9d9275783cf8..151b40627c14 100644
> > > > > >--- a/kernel/fork.c
> > > > > >+++ b/kernel/fork.c
> > > > > >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > > > >       return vma;
> > > > > > }
> > > > > >
> > > > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > > > >+                            struct vm_area_struct *dest)
> > > > > >+{
> > > [snip]
> > > > > Would this be difficult to maintain? We should make sure not to miss or
> > > > > overwrite anything.
> > > >
> > > > Yeah, it is less maintainable than a simple memcpy() but I did not
> > > > find a better alternative. I added a warning above the struct
> > > > vm_area_struct definition to update this function every time we change
> > > > that structure. Not sure if there is anything else I can do to help
> > > > with this.
> > > >
> > >
> > > Bare minimum this could have a BUILD_BUG_ON below the func for the
> > > known-covered size. But it would have to be conditional on arch and
> > > some macros, somewhat nasty.
> > >
> > > KASAN or KMSAN (I don't remember which) can be used to find missing
> > > copies. To that end the target struct could be marked as fully
> > > uninitialized before copy and have a full read performed from it
> > > afterwards -- guaranteed to trip over any field which any field not
> > > explicitly covered (including padding though). I don't know what magic
> > > macros can be used to do in Linux, I am saying the support to get this
> > > result is there. I understand most people don't use this, but this
> > > still should be enough to trip over buggy patches in -next.
> >
> > If my previous suggestion does not fly I'll start digging into KASAN
> > to see how we can use it. Thanks for the tip.
> >
> > >
> > > Finally, the struct could have macros delimiting copy/non-copy
> > > sections (with macros expanding to field names), for illustrative
> > > purposes:
> > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > > index 332cee285662..25063a3970c8 100644
> > > --- a/include/linux/mm_types.h
> > > +++ b/include/linux/mm_types.h
> > > @@ -677,6 +677,7 @@ struct vma_numab_state {
> > >   * getting a stable reference.
> > >   */
> > >  struct vm_area_struct {
> > > +#define vma_start_copy0 vm_rcu
> > >         /* The first cache line has the info for VMA tree walking. */
> > >
> > >         union {
> > > @@ -731,6 +732,7 @@ struct vm_area_struct {
> > >         /* Unstable RCU readers are allowed to read this. */
> > >         struct vma_lock *vm_lock;
> > >  #endif
> > > +#define vma_end_copy1 vm_lock
> > >
> > >         /*
> > >          * For areas with an address space and backing store,
> > >
> > > then you would do everything with a series of calls
> >
> > I'm not sure... My proposed approach with offsetof() I think is a bit
> > cleaner than adding macros to denote copy sections. WDYT?
> >
>
> another non-copy field may show up down the road and then the person
> adding it is going to be a sad panda. won't happen if the "infra" is
> there.
>
> but I concede this is not a big deal and i'm not going to bikeshed about it.

Yeah, I can't think of a perfect solution. I think we should pick a
sane one and if requirements change we can change the implementation
of vm_area_init_from() accordingly.

>
> --
> Mateusz Guzik <mjguzik gmail.com>
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Liam R. Howlett 1 year ago
* Suren Baghdasaryan <surenb@google.com> [250114 22:15]:
> On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> >
> > On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> >
> > >diff --git a/kernel/fork.c b/kernel/fork.c
> > >index 9d9275783cf8..151b40627c14 100644
> > >--- a/kernel/fork.c
> > >+++ b/kernel/fork.c
> > >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > >       return vma;
> > > }
> > >
> > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > >+                            struct vm_area_struct *dest)
> > >+{
> > >+      dest->vm_mm = src->vm_mm;
> > >+      dest->vm_ops = src->vm_ops;
> > >+      dest->vm_start = src->vm_start;
> > >+      dest->vm_end = src->vm_end;
> > >+      dest->anon_vma = src->anon_vma;
> > >+      dest->vm_pgoff = src->vm_pgoff;
> > >+      dest->vm_file = src->vm_file;
> > >+      dest->vm_private_data = src->vm_private_data;
> > >+      vm_flags_init(dest, src->vm_flags);
> > >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > >+             sizeof(dest->vm_page_prot));
> > >+      /*
> > >+       * src->shared.rb may be modified concurrently when called from
> > >+       * dup_mmap(), but the clone will reinitialize it.
> > >+       */
> > >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > >+             sizeof(dest->vm_userfaultfd_ctx));
> > >+#ifdef CONFIG_ANON_VMA_NAME
> > >+      dest->anon_name = src->anon_name;
> > >+#endif
> > >+#ifdef CONFIG_SWAP
> > >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > >+             sizeof(dest->swap_readahead_info));
> > >+#endif
> > >+#ifndef CONFIG_MMU
> > >+      dest->vm_region = src->vm_region;
> > >+#endif
> > >+#ifdef CONFIG_NUMA
> > >+      dest->vm_policy = src->vm_policy;
> > >+#endif
> > >+}
> >
> > Would this be difficult to maintain? We should make sure not to miss or
> > overwrite anything.
> 
> Yeah, it is less maintainable than a simple memcpy() but I did not
> find a better alternative. I added a warning above the struct
> vm_area_struct definition to update this function every time we change
> that structure. Not sure if there is anything else I can do to help
> with this.

Here's a horrible idea.. if we put the ref count at the end or start of
the struct, we could set the ref count to zero and copy the other area
in one memcpy().

Even worse idea, we could use an offsetof()-like macro to get the position
of the ref count in the vma struct, set the ref count to zero and
carefully copy the other two parts in two memcpy() operations.

Feel free to disregard these ideas as it is late here and I'm having
fun thinking up bad ways to make this "more" maintainable.

Either of these would make updating the struct easier, but very painful
to debug when it goes wrong (or reading the function).

Thanks,
Liam
Re: [PATCH v9 16/17] mm: make vma cache SLAB_TYPESAFE_BY_RCU
Posted by Suren Baghdasaryan 1 year ago
On Tue, Jan 14, 2025 at 7:58 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [250114 22:15]:
> > On Tue, Jan 14, 2025 at 6:27 PM Wei Yang <richard.weiyang@gmail.com> wrote:
> > >
> > > On Fri, Jan 10, 2025 at 08:26:03PM -0800, Suren Baghdasaryan wrote:
> > >
> > > >diff --git a/kernel/fork.c b/kernel/fork.c
> > > >index 9d9275783cf8..151b40627c14 100644
> > > >--- a/kernel/fork.c
> > > >+++ b/kernel/fork.c
> > > >@@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > > >       return vma;
> > > > }
> > > >
> > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > >+                            struct vm_area_struct *dest)
> > > >+{
> > > >+      dest->vm_mm = src->vm_mm;
> > > >+      dest->vm_ops = src->vm_ops;
> > > >+      dest->vm_start = src->vm_start;
> > > >+      dest->vm_end = src->vm_end;
> > > >+      dest->anon_vma = src->anon_vma;
> > > >+      dest->vm_pgoff = src->vm_pgoff;
> > > >+      dest->vm_file = src->vm_file;
> > > >+      dest->vm_private_data = src->vm_private_data;
> > > >+      vm_flags_init(dest, src->vm_flags);
> > > >+      memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > > >+             sizeof(dest->vm_page_prot));
> > > >+      /*
> > > >+       * src->shared.rb may be modified concurrently when called from
> > > >+       * dup_mmap(), but the clone will reinitialize it.
> > > >+       */
> > > >+      data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > > >+      memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > > >+             sizeof(dest->vm_userfaultfd_ctx));
> > > >+#ifdef CONFIG_ANON_VMA_NAME
> > > >+      dest->anon_name = src->anon_name;
> > > >+#endif
> > > >+#ifdef CONFIG_SWAP
> > > >+      memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > > >+             sizeof(dest->swap_readahead_info));
> > > >+#endif
> > > >+#ifndef CONFIG_MMU
> > > >+      dest->vm_region = src->vm_region;
> > > >+#endif
> > > >+#ifdef CONFIG_NUMA
> > > >+      dest->vm_policy = src->vm_policy;
> > > >+#endif
> > > >+}
> > >
> > > Would this be difficult to maintain? We should make sure not to miss or
> > > overwrite anything.
> >
> > Yeah, it is less maintainable than a simple memcpy() but I did not
> > find a better alternative. I added a warning above the struct
> > vm_area_struct definition to update this function every time we change
> > that structure. Not sure if there is anything else I can do to help
> > with this.
>
> Here's a horrible idea.. if we put the ref count at the end or start of
> the struct, we could set the ref count to zero and copy the other area
> in one memcpy().
>
> Even worse idea, we could use an offsetof()-like macro to get the position
> of the ref count in the vma struct, set the ref count to zero and
> carefully copy the other two parts in two memcpy() operations.

I implemented this approach in v3 of this patchset here:
https://lore.kernel.org/all/20241117080931.600731-5-surenb@google.com/
like this:

#define VMA_BEFORE_LOCK offsetof(struct vm_area_struct, vm_lock)
#define VMA_LOCK_END(vma) \
	(((void *)(vma)) + offsetofend(struct vm_area_struct, vm_lock))
#define VMA_AFTER_LOCK \
	(sizeof(struct vm_area_struct) - offsetofend(struct vm_area_struct, vm_lock))

static inline void vma_copy(struct vm_area_struct *new,
			    struct vm_area_struct *orig)
{
	/* Preserve vma->vm_lock */
	data_race(memcpy(new, orig, VMA_BEFORE_LOCK));
	data_race(memcpy(VMA_LOCK_END(new), VMA_LOCK_END(orig), VMA_AFTER_LOCK));
}

If this looks more maintainable I can revive it.
Maybe introduce a more generic function to copy any structure
excluding a specific field and use it like this:

copy_struct_except(new, orig, struct vm_area_struct, vm_refcnt);

Would that be better?
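
One possible shape for such a helper (a hypothetical sketch, not an
existing kernel macro):

#define copy_struct_except(dst, src, type, field)			\
do {									\
	/* Copy everything before the excluded field... */		\
	memcpy((dst), (src), offsetof(type, field));			\
	/* ...and everything after it, leaving the field untouched. */	\
	memcpy((void *)(dst) + offsetofend(type, field),		\
	       (const void *)(src) + offsetofend(type, field),		\
	       sizeof(type) - offsetofend(type, field));		\
} while (0)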

>
> Feel free to disregard these ideas as it is late here and I'm having
> fun thinking up bad ways to make this "more" maintainable.
>
> Either of these would make updating the struct easier, but very painful
> to debug when it goes wrong (or reading the function).
>
> Thanks,
> Liam