Right now these are performed in kernel/fork.c which is odd and a violation
of separation of concerns, as well as preventing us from integrating this
and related logic into userland VMA testing going forward, and perhaps more
importantly - enabling us to, in a subsequent commit, make VMA
allocation/freeing a purely internal mm operation.

There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.

To square the circle, let's add a new file - vma_init.c. This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.

This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.

Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
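Below the fold: an illustrative sketch, not part of the patch, of how the
userland harness exercises the now-shared allocator path after this change.
vma_state_init(), alloc_and_link_vma() and the new detach_free_vma() helper
are as introduced in this series; demo_shared_alloc_path() itself and its
flag choice are hypothetical scaffolding.

static bool demo_shared_alloc_path(void)
{
	struct mm_struct mm = {};
	struct vm_area_struct *vma;

	/* main() calls maple_tree_init() and then vma_state_init() once. */

	/* Allocates from vm_area_cachep and links the VMA into the tree. */
	vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, VM_READ | VM_WRITE);
	if (!vma)
		return false;

	/*
	 * vm_area_free() now asserts the VMA is detached, so tests must
	 * detach first - exactly what detach_free_vma() wraps up.
	 */
	detach_free_vma(vma);
	mtree_destroy(&mm.mm_mt);

	return true;
}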
MAINTAINERS | 1 +
kernel/fork.c | 88 -------------------
mm/Makefile | 2 +-
mm/mmap.c | 3 +-
mm/nommu.c | 4 +-
mm/vma.h | 7 ++
mm/vma_init.c | 101 ++++++++++++++++++++++
tools/testing/vma/Makefile | 2 +-
tools/testing/vma/vma.c | 26 ++++--
tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------
10 files changed, 251 insertions(+), 126 deletions(-)
create mode 100644 mm/vma_init.c
diff --git a/MAINTAINERS b/MAINTAINERS
index 1ee1c22e6e36..d274e6802ba5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -15656,6 +15656,7 @@ F: mm/mseal.c
F: mm/vma.c
F: mm/vma.h
F: mm/vma_exec.c
+F: mm/vma_init.c
F: mm/vma_internal.h
F: tools/testing/selftests/mm/merge.c
F: tools/testing/vma/
diff --git a/kernel/fork.c b/kernel/fork.c
index ac9f9267a473..9e4616dacd82 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!vma)
- return NULL;
-
- vma_init(vma, mm);
-
- return vma;
-}
-
-static void vm_area_init_from(const struct vm_area_struct *src,
- struct vm_area_struct *dest)
-{
- dest->vm_mm = src->vm_mm;
- dest->vm_ops = src->vm_ops;
- dest->vm_start = src->vm_start;
- dest->vm_end = src->vm_end;
- dest->anon_vma = src->anon_vma;
- dest->vm_pgoff = src->vm_pgoff;
- dest->vm_file = src->vm_file;
- dest->vm_private_data = src->vm_private_data;
- vm_flags_init(dest, src->vm_flags);
- memcpy(&dest->vm_page_prot, &src->vm_page_prot,
- sizeof(dest->vm_page_prot));
- /*
- * src->shared.rb may be modified concurrently when called from
- * dup_mmap(), but the clone will reinitialize it.
- */
- data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
- memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
- sizeof(dest->vm_userfaultfd_ctx));
-#ifdef CONFIG_ANON_VMA_NAME
- dest->anon_name = src->anon_name;
-#endif
-#ifdef CONFIG_SWAP
- memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
- sizeof(dest->swap_readahead_info));
-#endif
-#ifndef CONFIG_MMU
- dest->vm_region = src->vm_region;
-#endif
-#ifdef CONFIG_NUMA
- dest->vm_policy = src->vm_policy;
-#endif
-}
-
-struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-
- if (!new)
- return NULL;
-
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- vm_area_init_from(orig, new);
- vma_lock_init(new, true);
- INIT_LIST_HEAD(&new->anon_vma_chain);
- vma_numab_state_init(new);
- dup_anon_vma_name(orig, new);
-
- return new;
-}
-
-void vm_area_free(struct vm_area_struct *vma)
-{
- /* The vma should be detached while being destroyed. */
- vma_assert_detached(vma);
- vma_numab_state_free(vma);
- free_anon_vma_name(vma);
- kmem_cache_free(vm_area_cachep, vma);
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
void __init proc_caches_init(void)
{
- struct kmem_cache_args args = {
- .use_freeptr_offset = true,
- .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
- };
-
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- vm_area_cachep = kmem_cache_create("vm_area_struct",
- sizeof(struct vm_area_struct), &args,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/mm/Makefile b/mm/Makefile
index 15a901bb431a..690ddcf7d9a1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o mmap_lock.o $(mmu-y)
+ debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
diff --git a/mm/mmap.c b/mm/mmap.c
index 5259df031e15..81dd962a1cfc 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
#endif /* CONFIG_SYSCTL */
/*
- * initialise the percpu counter for VM
+ * initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@@ -1565,6 +1565,7 @@ void __init mmap_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", mmap_table);
#endif
+ vma_state_init();
}
/*
diff --git a/mm/nommu.c b/mm/nommu.c
index a142fc258d39..0bf4849b8204 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
};
/*
- * initialise the percpu counter for VM and region record slabs
+ * initialise the percpu counter for VM and region record slabs, initialise VMA
+ * state.
*/
void __init mmap_init(void)
{
@@ -409,6 +410,7 @@ void __init mmap_init(void)
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
register_sysctl_init("vm", nommu_table);
+ vma_state_init();
}
/*
diff --git a/mm/vma.h b/mm/vma.h
index 94307a2e4ab6..4a1e1768ca46 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -548,8 +548,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
int __vm_munmap(unsigned long start, size_t len, bool unlock);
+
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
+/* vma_init.h, shared between CONFIG_MMU and nommu. */
+void __init vma_state_init(void);
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
+void vm_area_free(struct vm_area_struct *vma);
+
/* vma_exec.h */
#ifdef CONFIG_MMU
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
diff --git a/mm/vma_init.c b/mm/vma_init.c
new file mode 100644
index 000000000000..967ca8517986
--- /dev/null
+++ b/mm/vma_init.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
+ * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+void __init vma_state_init(void)
+{
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
+
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
+}
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifndef CONFIG_MMU
+ dest->vm_region = src->vm_region;
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index 624040fcf193..66f3831a668f 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -9,7 +9,7 @@ include ../shared/shared.mk
OFILES = $(SHARED_OFILES) vma.o maple-shim.o
TARGETS = vma
-vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
+vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
vma: $(OFILES)
$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index 5832ae5d797d..2be7597a2ac2 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
* Directly import the VMA implementation here. Our vma_internal.h wrapper
* provides userland-equivalent functionality for everything vma.c uses.
*/
+#include "../../../mm/vma_init.c"
#include "../../../mm/vma_exec.c"
#include "../../../mm/vma.c"
@@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
return res;
}
+static void detach_free_vma(struct vm_area_struct *vma)
+{
+ vma_mark_detached(vma);
+ vm_area_free(vma);
+}
+
/* Helper function to allocate a VMA and link it to the tree. */
static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
unsigned long start,
@@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
return NULL;
if (attach_vma(mm, vma)) {
- vm_area_free(vma);
+ detach_free_vma(vma);
return NULL;
}
@@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
vma_iter_set(vmi, 0);
for_each_vma(*vmi, vma) {
- vm_area_free(vma);
+ detach_free_vma(vma);
count++;
}
@@ -319,7 +326,7 @@ static bool test_simple_merge(void)
ASSERT_EQ(vma->vm_pgoff, 0);
ASSERT_EQ(vma->vm_flags, flags);
- vm_area_free(vma);
+ detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@@ -361,7 +368,7 @@ static bool test_simple_modify(void)
ASSERT_EQ(vma->vm_end, 0x1000);
ASSERT_EQ(vma->vm_pgoff, 0);
- vm_area_free(vma);
+ detach_free_vma(vma);
vma_iter_clear(&vmi);
vma = vma_next(&vmi);
@@ -370,7 +377,7 @@ static bool test_simple_modify(void)
ASSERT_EQ(vma->vm_end, 0x2000);
ASSERT_EQ(vma->vm_pgoff, 1);
- vm_area_free(vma);
+ detach_free_vma(vma);
vma_iter_clear(&vmi);
vma = vma_next(&vmi);
@@ -379,7 +386,7 @@ static bool test_simple_modify(void)
ASSERT_EQ(vma->vm_end, 0x3000);
ASSERT_EQ(vma->vm_pgoff, 2);
- vm_area_free(vma);
+ detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@@ -407,7 +414,7 @@ static bool test_simple_expand(void)
ASSERT_EQ(vma->vm_end, 0x3000);
ASSERT_EQ(vma->vm_pgoff, 0);
- vm_area_free(vma);
+ detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
ASSERT_EQ(vma->vm_end, 0x1000);
ASSERT_EQ(vma->vm_pgoff, 0);
- vm_area_free(vma);
+ detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@@ -619,7 +626,7 @@ static bool test_merge_new(void)
ASSERT_EQ(vma->vm_pgoff, 0);
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
- vm_area_free(vma);
+ detach_free_vma(vma);
count++;
}
@@ -1668,6 +1675,7 @@ int main(void)
int num_tests = 0, num_fail = 0;
maple_tree_init();
+ vma_state_init();
#define TEST(name) \
do { \
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 32e990313158..198abe66de5a 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
*/
#define pr_warn_once pr_err
+#define data_race(expr) expr
+
+#define ASSERT_EXCLUSIVE_WRITER(x)
+
struct kref {
refcount_t refcount;
};
@@ -255,6 +259,8 @@ struct file {
#define VMA_LOCK_OFFSET 0x40000000
+typedef struct { unsigned long v; } freeptr_t;
+
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
@@ -264,9 +270,7 @@ struct vm_area_struct {
unsigned long vm_start;
unsigned long vm_end;
};
-#ifdef CONFIG_PER_VMA_LOCK
- struct rcu_head vm_rcu; /* Used for deferred freeing. */
-#endif
+ freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
};
struct mm_struct *vm_mm; /* The address space we belong to. */
@@ -463,6 +467,65 @@ struct pagetable_move_control {
.len_in = len_, \
}
+struct kmem_cache_args {
+ /**
+ * @align: The required alignment for the objects.
+ *
+ * %0 means no specific alignment is requested.
+ */
+ unsigned int align;
+ /**
+ * @useroffset: Usercopy region offset.
+ *
+ * %0 is a valid offset, when @usersize is non-%0
+ */
+ unsigned int useroffset;
+ /**
+ * @usersize: Usercopy region size.
+ *
+ * %0 means no usercopy region is specified.
+ */
+ unsigned int usersize;
+ /**
+ * @freeptr_offset: Custom offset for the free pointer
+ * in &SLAB_TYPESAFE_BY_RCU caches
+ *
+ * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
+ * outside of the object. This might cause the object to grow in size.
+ * Cache creators that have a reason to avoid this can specify a custom
+ * free pointer offset in their struct where the free pointer will be
+ * placed.
+ *
+ * Note that placing the free pointer inside the object requires the
+ * caller to ensure that no fields are invalidated that are required to
+ * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
+ * details).
+ *
+ * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
+ * is specified, %use_freeptr_offset must be set %true.
+ *
+ * Note that @ctor currently isn't supported with custom free pointers
+ * as a @ctor requires an external free pointer.
+ */
+ unsigned int freeptr_offset;
+ /**
+ * @use_freeptr_offset: Whether a @freeptr_offset is used.
+ */
+ bool use_freeptr_offset;
+ /**
+ * @ctor: A constructor for the objects.
+ *
+ * The constructor is invoked for each object in a newly allocated slab
+ * page. It is the cache user's responsibility to free object in the
+ * same state as after calling the constructor, or deal appropriately
+ * with any differences between a freshly constructed and a reallocated
+ * object.
+ *
+ * %NULL means no constructor.
+ */
+ void (*ctor)(void *);
+};
+
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
mas_pause(&vmi->mas);
@@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_lock_seq = UINT_MAX;
}
-static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
+struct kmem_cache {
+ const char *name;
+ size_t object_size;
+ struct kmem_cache_args *args;
+};
- if (!vma)
- return NULL;
+static inline struct kmem_cache *__kmem_cache_create(const char *name,
+ size_t object_size,
+ struct kmem_cache_args *args)
+{
+ struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
- vma_init(vma, mm);
+ ret->name = name;
+ ret->object_size = object_size;
+ ret->args = args;
- return vma;
+ return ret;
}
-static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
+#define kmem_cache_create(__name, __object_size, __args, ...) \
+ __kmem_cache_create((__name), (__object_size), (__args))
- if (!new)
- return NULL;
+static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+{
+ (void)gfpflags;
- memcpy(new, orig, sizeof(*new));
- refcount_set(&new->vm_refcnt, 0);
- new->vm_lock_seq = UINT_MAX;
- INIT_LIST_HEAD(&new->anon_vma_chain);
+ return calloc(s->object_size, 1);
+}
- return new;
+static inline void kmem_cache_free(struct kmem_cache *s, void *x)
+{
+ free(x);
}
/*
@@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
{
}
-static inline void vm_area_free(struct vm_area_struct *vma)
-{
- free(vma);
-}
-
static inline void lru_add_drain(void)
{
}
@@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
(void)mm;
}
+static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
+{
+ (void)vma;
+ (void)reset_refcnt;
+}
+
+static inline void vma_numab_state_init(struct vm_area_struct *vma)
+{
+ (void)vma;
+}
+
+static inline void vma_numab_state_free(struct vm_area_struct *vma)
+{
+ (void)vma;
+}
+
+static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma)
+{
+ (void)orig_vma;
+ (void)new_vma;
+}
+
+static inline void free_anon_vma_name(struct vm_area_struct *vma)
+{
+ (void)vma;
+}
+
#endif /* __MM_VMA_INTERNAL_H */
--
2.49.0
On Mon, Apr 28, 2025 at 04:28:17PM +0100, Lorenzo Stoakes wrote:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Pedro Falcato <pfalcato@suse.de>

--
Pedro
On 28.04.25 17:28, Lorenzo Stoakes wrote:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---

Reviewed-by: David Hildenbrand <david@redhat.com>

--
Cheers,

David / dhildenb
On Mon, Apr 28, 2025 at 04:28:17PM +0100, Lorenzo Stoakes wrote:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Kees Cook <kees@kernel.org>

--
Kees Cook
On Mon, Apr 28, 2025 at 8:31 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
Great! We are getting closer to parity between tests and the kernel code.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> ---
> MAINTAINERS | 1 +
> kernel/fork.c | 88 -------------------
> mm/Makefile | 2 +-
> mm/mmap.c | 3 +-
> mm/nommu.c | 4 +-
> mm/vma.h | 7 ++
> mm/vma_init.c | 101 ++++++++++++++++++++++
> tools/testing/vma/Makefile | 2 +-
> tools/testing/vma/vma.c | 26 ++++--
> tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------
> 10 files changed, 251 insertions(+), 126 deletions(-)
> create mode 100644 mm/vma_init.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1ee1c22e6e36..d274e6802ba5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -15656,6 +15656,7 @@ F: mm/mseal.c
> F: mm/vma.c
> F: mm/vma.h
> F: mm/vma_exec.c
> +F: mm/vma_init.c
> F: mm/vma_internal.h
> F: tools/testing/selftests/mm/merge.c
> F: tools/testing/vma/
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ac9f9267a473..9e4616dacd82 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
> /* SLAB cache for fs_struct structures (tsk->fs) */
> struct kmem_cache *fs_cachep;
>
> -/* SLAB cache for vm_area_struct structures */
> -static struct kmem_cache *vm_area_cachep;
> -
> /* SLAB cache for mm_struct structures (tsk->mm) */
> static struct kmem_cache *mm_cachep;
Maybe at some point we will be able to move mm_cachep out of here as well?
>
> -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma;
> -
> - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> - if (!vma)
> - return NULL;
> -
> - vma_init(vma, mm);
> -
> - return vma;
> -}
> -
> -static void vm_area_init_from(const struct vm_area_struct *src,
> - struct vm_area_struct *dest)
> -{
> - dest->vm_mm = src->vm_mm;
> - dest->vm_ops = src->vm_ops;
> - dest->vm_start = src->vm_start;
> - dest->vm_end = src->vm_end;
> - dest->anon_vma = src->anon_vma;
> - dest->vm_pgoff = src->vm_pgoff;
> - dest->vm_file = src->vm_file;
> - dest->vm_private_data = src->vm_private_data;
> - vm_flags_init(dest, src->vm_flags);
> - memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> - sizeof(dest->vm_page_prot));
> - /*
> - * src->shared.rb may be modified concurrently when called from
> - * dup_mmap(), but the clone will reinitialize it.
> - */
> - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> - sizeof(dest->vm_userfaultfd_ctx));
> -#ifdef CONFIG_ANON_VMA_NAME
> - dest->anon_name = src->anon_name;
> -#endif
> -#ifdef CONFIG_SWAP
> - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> - sizeof(dest->swap_readahead_info));
> -#endif
> -#ifndef CONFIG_MMU
> - dest->vm_region = src->vm_region;
> -#endif
> -#ifdef CONFIG_NUMA
> - dest->vm_policy = src->vm_policy;
> -#endif
> -}
> -
> -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> -
> - if (!new)
> - return NULL;
> -
> - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> - ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> - vm_area_init_from(orig, new);
> - vma_lock_init(new, true);
> - INIT_LIST_HEAD(&new->anon_vma_chain);
> - vma_numab_state_init(new);
> - dup_anon_vma_name(orig, new);
> -
> - return new;
> -}
> -
> -void vm_area_free(struct vm_area_struct *vma)
> -{
> - /* The vma should be detached while being destroyed. */
> - vma_assert_detached(vma);
> - vma_numab_state_free(vma);
> - free_anon_vma_name(vma);
> - kmem_cache_free(vm_area_cachep, vma);
> -}
> -
> static void account_kernel_stack(struct task_struct *tsk, int account)
> {
> if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
>
> void __init proc_caches_init(void)
> {
> - struct kmem_cache_args args = {
> - .use_freeptr_offset = true,
> - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> - };
> -
> sighand_cachep = kmem_cache_create("sighand_cache",
> sizeof(struct sighand_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
> sizeof(struct fs_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> NULL);
> - vm_area_cachep = kmem_cache_create("vm_area_struct",
> - sizeof(struct vm_area_struct), &args,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> - SLAB_ACCOUNT);
> mmap_init();
> nsproxy_cache_init();
> }
> diff --git a/mm/Makefile b/mm/Makefile
> index 15a901bb431a..690ddcf7d9a1 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
> mm_init.o percpu.o slab_common.o \
> compaction.o show_mem.o \
> interval_tree.o list_lru.o workingset.o \
> - debug.o gup.o mmap_lock.o $(mmu-y)
> + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
>
> # Give 'page_alloc' its own module-parameter namespace
> page-alloc-y := page_alloc.o
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5259df031e15..81dd962a1cfc 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
> #endif /* CONFIG_SYSCTL */
>
> /*
> - * initialise the percpu counter for VM
> + * initialise the percpu counter for VM, initialise VMA state.
> */
> void __init mmap_init(void)
> {
> @@ -1565,6 +1565,7 @@ void __init mmap_init(void)
> #ifdef CONFIG_SYSCTL
> register_sysctl_init("vm", mmap_table);
> #endif
> + vma_state_init();
> }
>
> /*
> diff --git a/mm/nommu.c b/mm/nommu.c
> index a142fc258d39..0bf4849b8204 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
> };
>
> /*
> - * initialise the percpu counter for VM and region record slabs
> + * initialise the percpu counter for VM and region record slabs, initialise VMA
> + * state.
> */
> void __init mmap_init(void)
> {
> @@ -409,6 +410,7 @@ void __init mmap_init(void)
> VM_BUG_ON(ret);
> vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
> register_sysctl_init("vm", nommu_table);
> + vma_state_init();
> }
>
> /*
> diff --git a/mm/vma.h b/mm/vma.h
> index 94307a2e4ab6..4a1e1768ca46 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -548,8 +548,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>
> int __vm_munmap(unsigned long start, size_t len, bool unlock);
>
> +
> int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
>
> +/* vma_init.h, shared between CONFIG_MMU and nommu. */
> +void __init vma_state_init(void);
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
> +void vm_area_free(struct vm_area_struct *vma);
> +
> /* vma_exec.h */
> #ifdef CONFIG_MMU
> int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> diff --git a/mm/vma_init.c b/mm/vma_init.c
> new file mode 100644
> index 000000000000..967ca8517986
> --- /dev/null
> +++ b/mm/vma_init.c
> @@ -0,0 +1,101 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/*
> + * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
> + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
> + */
> +
> +#include "vma_internal.h"
> +#include "vma.h"
> +
> +/* SLAB cache for vm_area_struct structures */
> +static struct kmem_cache *vm_area_cachep;
> +
> +void __init vma_state_init(void)
> +{
> + struct kmem_cache_args args = {
> + .use_freeptr_offset = true,
> + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> + };
> +
> + vm_area_cachep = kmem_cache_create("vm_area_struct",
> + sizeof(struct vm_area_struct), &args,
> + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> + SLAB_ACCOUNT);
> +}
> +
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> +{
> + struct vm_area_struct *vma;
> +
> + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> + if (!vma)
> + return NULL;
> +
> + vma_init(vma, mm);
> +
> + return vma;
> +}
> +
> +static void vm_area_init_from(const struct vm_area_struct *src,
> + struct vm_area_struct *dest)
> +{
> + dest->vm_mm = src->vm_mm;
> + dest->vm_ops = src->vm_ops;
> + dest->vm_start = src->vm_start;
> + dest->vm_end = src->vm_end;
> + dest->anon_vma = src->anon_vma;
> + dest->vm_pgoff = src->vm_pgoff;
> + dest->vm_file = src->vm_file;
> + dest->vm_private_data = src->vm_private_data;
> + vm_flags_init(dest, src->vm_flags);
> + memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> + sizeof(dest->vm_page_prot));
> + /*
> + * src->shared.rb may be modified concurrently when called from
> + * dup_mmap(), but the clone will reinitialize it.
> + */
> + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> + sizeof(dest->vm_userfaultfd_ctx));
> +#ifdef CONFIG_ANON_VMA_NAME
> + dest->anon_name = src->anon_name;
> +#endif
> +#ifdef CONFIG_SWAP
> + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> + sizeof(dest->swap_readahead_info));
> +#endif
> +#ifndef CONFIG_MMU
> + dest->vm_region = src->vm_region;
> +#endif
> +#ifdef CONFIG_NUMA
> + dest->vm_policy = src->vm_policy;
> +#endif
> +}
> +
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> +{
> + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> +
> + if (!new)
> + return NULL;
> +
> + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> + ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> + vm_area_init_from(orig, new);
> + vma_lock_init(new, true);
> + INIT_LIST_HEAD(&new->anon_vma_chain);
> + vma_numab_state_init(new);
> + dup_anon_vma_name(orig, new);
> +
> + return new;
> +}
> +
> +void vm_area_free(struct vm_area_struct *vma)
> +{
> + /* The vma should be detached while being destroyed. */
> + vma_assert_detached(vma);
> + vma_numab_state_free(vma);
> + free_anon_vma_name(vma);
> + kmem_cache_free(vm_area_cachep, vma);
> +}
> diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> index 624040fcf193..66f3831a668f 100644
> --- a/tools/testing/vma/Makefile
> +++ b/tools/testing/vma/Makefile
> @@ -9,7 +9,7 @@ include ../shared/shared.mk
> OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> TARGETS = vma
>
> -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
>
> vma: $(OFILES)
> $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index 5832ae5d797d..2be7597a2ac2 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> * Directly import the VMA implementation here. Our vma_internal.h wrapper
> * provides userland-equivalent functionality for everything vma.c uses.
> */
> +#include "../../../mm/vma_init.c"
> #include "../../../mm/vma_exec.c"
> #include "../../../mm/vma.c"
>
> @@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
> return res;
> }
>
> +static void detach_free_vma(struct vm_area_struct *vma)
In case you respin another version, I think this change related to
detach_free_vma() would better be done in a separate patch. But I'm
totally fine with the way it is now, so don't respin just for that.
> +{
> + vma_mark_detached(vma);
> + vm_area_free(vma);
> +}
> +
> /* Helper function to allocate a VMA and link it to the tree. */
> static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> unsigned long start,
> @@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> return NULL;
>
> if (attach_vma(mm, vma)) {
> - vm_area_free(vma);
> + detach_free_vma(vma);
> return NULL;
> }
>
> @@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
>
> vma_iter_set(vmi, 0);
> for_each_vma(*vmi, vma) {
> - vm_area_free(vma);
> + detach_free_vma(vma);
> count++;
> }
>
> @@ -319,7 +326,7 @@ static bool test_simple_merge(void)
> ASSERT_EQ(vma->vm_pgoff, 0);
> ASSERT_EQ(vma->vm_flags, flags);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -361,7 +368,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x1000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> vma_iter_clear(&vmi);
>
> vma = vma_next(&vmi);
> @@ -370,7 +377,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x2000);
> ASSERT_EQ(vma->vm_pgoff, 1);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> vma_iter_clear(&vmi);
>
> vma = vma_next(&vmi);
> @@ -379,7 +386,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x3000);
> ASSERT_EQ(vma->vm_pgoff, 2);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -407,7 +414,7 @@ static bool test_simple_expand(void)
> ASSERT_EQ(vma->vm_end, 0x3000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
> ASSERT_EQ(vma->vm_end, 0x1000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -619,7 +626,7 @@ static bool test_merge_new(void)
> ASSERT_EQ(vma->vm_pgoff, 0);
> ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> count++;
> }
>
> @@ -1668,6 +1675,7 @@ int main(void)
> int num_tests = 0, num_fail = 0;
>
> maple_tree_init();
> + vma_state_init();
>
> #define TEST(name) \
> do { \
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 32e990313158..198abe66de5a 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
> */
> #define pr_warn_once pr_err
>
> +#define data_race(expr) expr
> +
> +#define ASSERT_EXCLUSIVE_WRITER(x)
> +
> struct kref {
> refcount_t refcount;
> };
> @@ -255,6 +259,8 @@ struct file {
>
> #define VMA_LOCK_OFFSET 0x40000000
>
> +typedef struct { unsigned long v; } freeptr_t;
> +
> struct vm_area_struct {
> /* The first cache line has the info for VMA tree walking. */
>
> @@ -264,9 +270,7 @@ struct vm_area_struct {
> unsigned long vm_start;
> unsigned long vm_end;
> };
> -#ifdef CONFIG_PER_VMA_LOCK
> - struct rcu_head vm_rcu; /* Used for deferred freeing. */
> -#endif
> + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
Oops, I must have missed this when adding SLAB_TYPESAFE_BY_RCU to the
vm_area_struct cache. Thanks for fixing it!
> };
>
> struct mm_struct *vm_mm; /* The address space we belong to. */
> @@ -463,6 +467,65 @@ struct pagetable_move_control {
> .len_in = len_, \
> }
>
> +struct kmem_cache_args {
> + /**
> + * @align: The required alignment for the objects.
> + *
> + * %0 means no specific alignment is requested.
> + */
> + unsigned int align;
> + /**
> + * @useroffset: Usercopy region offset.
> + *
> + * %0 is a valid offset, when @usersize is non-%0
> + */
> + unsigned int useroffset;
> + /**
> + * @usersize: Usercopy region size.
> + *
> + * %0 means no usercopy region is specified.
> + */
> + unsigned int usersize;
> + /**
> + * @freeptr_offset: Custom offset for the free pointer
> + * in &SLAB_TYPESAFE_BY_RCU caches
> + *
> + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
> + * outside of the object. This might cause the object to grow in size.
> + * Cache creators that have a reason to avoid this can specify a custom
> + * free pointer offset in their struct where the free pointer will be
> + * placed.
> + *
> + * Note that placing the free pointer inside the object requires the
> + * caller to ensure that no fields are invalidated that are required to
> + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
> + * details).
> + *
> + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
> + * is specified, %use_freeptr_offset must be set %true.
> + *
> + * Note that @ctor currently isn't supported with custom free pointers
> + * as a @ctor requires an external free pointer.
> + */
> + unsigned int freeptr_offset;
> + /**
> + * @use_freeptr_offset: Whether a @freeptr_offset is used.
> + */
> + bool use_freeptr_offset;
> + /**
> + * @ctor: A constructor for the objects.
> + *
> + * The constructor is invoked for each object in a newly allocated slab
> + * page. It is the cache user's responsibility to free object in the
> + * same state as after calling the constructor, or deal appropriately
> + * with any differences between a freshly constructed and a reallocated
> + * object.
> + *
> + * %NULL means no constructor.
> + */
> + void (*ctor)(void *);
> +};
> +
> static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> {
> mas_pause(&vmi->mas);
> @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> vma->vm_lock_seq = UINT_MAX;
> }
>
> -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
> +struct kmem_cache {
> + const char *name;
> + size_t object_size;
> + struct kmem_cache_args *args;
> +};
>
> - if (!vma)
> - return NULL;
> +static inline struct kmem_cache *__kmem_cache_create(const char *name,
> + size_t object_size,
> + struct kmem_cache_args *args)
> +{
> + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
>
> - vma_init(vma, mm);
> + ret->name = name;
> + ret->object_size = object_size;
> + ret->args = args;
>
> - return vma;
> + return ret;
> }
>
> -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
> +#define kmem_cache_create(__name, __object_size, __args, ...) \
> + __kmem_cache_create((__name), (__object_size), (__args))
>
> - if (!new)
> - return NULL;
> +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> +{
> + (void)gfpflags;
>
> - memcpy(new, orig, sizeof(*new));
> - refcount_set(&new->vm_refcnt, 0);
> - new->vm_lock_seq = UINT_MAX;
> - INIT_LIST_HEAD(&new->anon_vma_chain);
> + return calloc(s->object_size, 1);
> +}
>
> - return new;
> +static inline void kmem_cache_free(struct kmem_cache *s, void *x)
> +{
> + free(x);
> }
>
> /*
> @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
> {
> }
>
> -static inline void vm_area_free(struct vm_area_struct *vma)
> -{
> - free(vma);
> -}
> -
> static inline void lru_add_drain(void)
> {
> }
> @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
> (void)mm;
> }
>
> +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> +{
> + (void)vma;
> + (void)reset_refcnt;
> +}
> +
> +static inline void vma_numab_state_init(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> +static inline void vma_numab_state_free(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> + struct vm_area_struct *new_vma)
> +{
> + (void)orig_vma;
> + (void)new_vma;
> +}
> +
> +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.49.0
>
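A side note on the vm_freeptr change Suren flags above: by default a
SLAB_TYPESAFE_BY_RCU cache places its free pointer outside the object,
growing it, while freeptr_offset lets the cache keep it inside, at a field
that is safe to clobber once the object is freed - for vm_area_struct this
is the union member that previously held vm_rcu. A minimal sketch of the
pattern follows; struct foo and its fields are hypothetical, while the
kmem_cache_args/kmem_cache_create() usage matches the patch:

struct foo {
	unsigned long id;
	union {
		struct list_head list;	/* only valid while the object is live */
		freeptr_t free;		/* reused by the allocator once freed */
	};
};

static struct kmem_cache *foo_cachep;

static void __init foo_cache_init(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct foo, free),
	};

	/* The free pointer lives inside the object rather than growing it. */
	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), &args,
				       SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT);
}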
On 4/28/25 17:28, Lorenzo Stoakes wrote:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.

I wonder if the last part is from an earlier version and now obsolete
because there's no subsequent commit in this series and the placement of
alloc/freeing in vma_init.c seems to make those purely internal mm
operations already? Or do you mean some further plans?

> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
On Tue, Apr 29, 2025 at 09:22:59AM +0200, Vlastimil Babka wrote:
> On 4/28/25 17:28, Lorenzo Stoakes wrote:
> > Right now these are performed in kernel/fork.c which is odd and a violation
> > of separation of concerns, as well as preventing us from integrating this
> > and related logic into userland VMA testing going forward, and perhaps more
> > importantly - enabling us to, in a subsequent commit, make VMA
> > allocation/freeing a purely internal mm operation.
>
> I wonder if the last part is from an earlier version and now obsolete
> because there's no subsequent commit in this series and the placement of
> alloc/freeing in vma_init.c seems to make those purely internal mm
> operations already? Or do you mean some further plans?
>

Sorry, missed this!

Andrew - could we delete the last part of this sentence so it reads:

Right now these are performed in kernel/fork.c which is odd and a violation
of separation of concerns, as well as preventing us from integrating this
and related logic into userland VMA testing going forward.

Thanks!
On Wed, 30 Apr 2025 10:20:10 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:

> On Tue, Apr 29, 2025 at 09:22:59AM +0200, Vlastimil Babka wrote:
> > On 4/28/25 17:28, Lorenzo Stoakes wrote:
> > > Right now these are performed in kernel/fork.c which is odd and a violation
> > > of separation of concerns, as well as preventing us from integrating this
> > > and related logic into userland VMA testing going forward, and perhaps more
> > > importantly - enabling us to, in a subsequent commit, make VMA
> > > allocation/freeing a purely internal mm operation.
> >
> > I wonder if the last part is from an earlier version and now obsolete
> > because there's no subsequent commit in this series and the placement of
> > alloc/freeing in vma_init.c seems to make those purely internal mm
> > operations already? Or do you mean some further plans?
> >
>
> Sorry, missed this!
>
> Andrew - could we delete the last part of this sentence so it reads:
>
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward.

Sure. The result:

: Right now these are performed in kernel/fork.c which is odd and a
: violation of separation of concerns, as well as preventing us from
: integrating this and related logic into userland VMA testing going
: forward.
:
: There is a fly in the ointment - nommu - mmap.c is not compiled if
: CONFIG_MMU not set, and neither is vma.c.
:
: To square the circle, let's add a new file - vma_init.c. This will be
: compiled for both CONFIG_MMU and nommu builds, and will also form part of
: the VMA userland testing.
:
: This allows us to de-duplicate code, while maintaining separation of
: concerns and the ability for us to userland test this logic.
:
: Update the VMA userland tests accordingly, additionally adding a
: detach_free_vma() helper function to correctly detach VMAs before freeing
: them in test code, as this change was triggering the assert for this.
On Wed, Apr 30, 2025 at 02:42:36PM -0700, Andrew Morton wrote:
> On Wed, 30 Apr 2025 10:20:10 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > On Tue, Apr 29, 2025 at 09:22:59AM +0200, Vlastimil Babka wrote:
> > > On 4/28/25 17:28, Lorenzo Stoakes wrote:
> > > > Right now these are performed in kernel/fork.c which is odd and a violation
> > > > of separation of concerns, as well as preventing us from integrating this
> > > > and related logic into userland VMA testing going forward, and perhaps more
> > > > importantly - enabling us to, in a subsequent commit, make VMA
> > > > allocation/freeing a purely internal mm operation.
> > >
> > > I wonder if the last part is from an earlier version and now obsolete
> > > because there's no subsequent commit in this series and the placement of
> > > alloc/freeing in vma_init.c seems to make those purely internal mm
> > > operations already? Or do you mean some further plans?
> > >
> >
> > Sorry, missed this!
> >
> > Andrew - could we delete the last part of this sentence so it reads:
> >
> > Right now these are performed in kernel/fork.c which is odd and a violation
> > of separation of concerns, as well as preventing us from integrating this
> > and related logic into userland VMA testing going forward.
>
> Sure. The result:
>
> : Right now these are performed in kernel/fork.c which is odd and a
> : violation of separation of concerns, as well as preventing us from
> : integrating this and related logic into userland VMA testing going
> : forward.
> :
> : There is a fly in the ointment - nommu - mmap.c is not compiled if
> : CONFIG_MMU not set, and neither is vma.c.
> :
> : To square the circle, let's add a new file - vma_init.c. This will be
> : compiled for both CONFIG_MMU and nommu builds, and will also form part of
> : the VMA userland testing.
> :
> : This allows us to de-duplicate code, while maintaining separation of
> : concerns and the ability for us to userland test this logic.
> :
> : Update the VMA userland tests accordingly, additionally adding a
> : detach_free_vma() helper function to correctly detach VMAs before freeing
> : them in test code, as this change was triggering the assert for this.
>

Perfect, thanks!
Andrew - I managed to typo vma_init.c to vma_init.h here in mm/vma.h (at least I
was consistent in these silly typos...).
Would you prefer a fixpatch or is it ok for you to quickly fix that up?
Thanks!
On Mon, Apr 28, 2025 at 04:28:17PM +0100, Lorenzo Stoakes wrote:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> MAINTAINERS | 1 +
> kernel/fork.c | 88 -------------------
> mm/Makefile | 2 +-
> mm/mmap.c | 3 +-
> mm/nommu.c | 4 +-
> mm/vma.h | 7 ++
> mm/vma_init.c | 101 ++++++++++++++++++++++
> tools/testing/vma/Makefile | 2 +-
> tools/testing/vma/vma.c | 26 ++++--
> tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------
> 10 files changed, 251 insertions(+), 126 deletions(-)
> create mode 100644 mm/vma_init.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1ee1c22e6e36..d274e6802ba5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -15656,6 +15656,7 @@ F: mm/mseal.c
> F: mm/vma.c
> F: mm/vma.h
> F: mm/vma_exec.c
> +F: mm/vma_init.c
> F: mm/vma_internal.h
> F: tools/testing/selftests/mm/merge.c
> F: tools/testing/vma/
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ac9f9267a473..9e4616dacd82 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
> /* SLAB cache for fs_struct structures (tsk->fs) */
> struct kmem_cache *fs_cachep;
>
> -/* SLAB cache for vm_area_struct structures */
> -static struct kmem_cache *vm_area_cachep;
> -
> /* SLAB cache for mm_struct structures (tsk->mm) */
> static struct kmem_cache *mm_cachep;
>
> -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma;
> -
> - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> - if (!vma)
> - return NULL;
> -
> - vma_init(vma, mm);
> -
> - return vma;
> -}
> -
> -static void vm_area_init_from(const struct vm_area_struct *src,
> - struct vm_area_struct *dest)
> -{
> - dest->vm_mm = src->vm_mm;
> - dest->vm_ops = src->vm_ops;
> - dest->vm_start = src->vm_start;
> - dest->vm_end = src->vm_end;
> - dest->anon_vma = src->anon_vma;
> - dest->vm_pgoff = src->vm_pgoff;
> - dest->vm_file = src->vm_file;
> - dest->vm_private_data = src->vm_private_data;
> - vm_flags_init(dest, src->vm_flags);
> - memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> - sizeof(dest->vm_page_prot));
> - /*
> - * src->shared.rb may be modified concurrently when called from
> - * dup_mmap(), but the clone will reinitialize it.
> - */
> - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> - sizeof(dest->vm_userfaultfd_ctx));
> -#ifdef CONFIG_ANON_VMA_NAME
> - dest->anon_name = src->anon_name;
> -#endif
> -#ifdef CONFIG_SWAP
> - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> - sizeof(dest->swap_readahead_info));
> -#endif
> -#ifndef CONFIG_MMU
> - dest->vm_region = src->vm_region;
> -#endif
> -#ifdef CONFIG_NUMA
> - dest->vm_policy = src->vm_policy;
> -#endif
> -}
> -
> -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> -
> - if (!new)
> - return NULL;
> -
> - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> - ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> - vm_area_init_from(orig, new);
> - vma_lock_init(new, true);
> - INIT_LIST_HEAD(&new->anon_vma_chain);
> - vma_numab_state_init(new);
> - dup_anon_vma_name(orig, new);
> -
> - return new;
> -}
> -
> -void vm_area_free(struct vm_area_struct *vma)
> -{
> - /* The vma should be detached while being destroyed. */
> - vma_assert_detached(vma);
> - vma_numab_state_free(vma);
> - free_anon_vma_name(vma);
> - kmem_cache_free(vm_area_cachep, vma);
> -}
> -
> static void account_kernel_stack(struct task_struct *tsk, int account)
> {
> if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
>
> void __init proc_caches_init(void)
> {
> - struct kmem_cache_args args = {
> - .use_freeptr_offset = true,
> - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> - };
> -
> sighand_cachep = kmem_cache_create("sighand_cache",
> sizeof(struct sighand_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
> sizeof(struct fs_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> NULL);
> - vm_area_cachep = kmem_cache_create("vm_area_struct",
> - sizeof(struct vm_area_struct), &args,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> - SLAB_ACCOUNT);
> mmap_init();
> nsproxy_cache_init();
> }
> diff --git a/mm/Makefile b/mm/Makefile
> index 15a901bb431a..690ddcf7d9a1 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
> mm_init.o percpu.o slab_common.o \
> compaction.o show_mem.o \
> interval_tree.o list_lru.o workingset.o \
> - debug.o gup.o mmap_lock.o $(mmu-y)
> + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
>
> # Give 'page_alloc' its own module-parameter namespace
> page-alloc-y := page_alloc.o
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5259df031e15..81dd962a1cfc 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
> #endif /* CONFIG_SYSCTL */
>
> /*
> - * initialise the percpu counter for VM
> + * initialise the percpu counter for VM, initialise VMA state.
> */
> void __init mmap_init(void)
> {
> @@ -1565,6 +1565,7 @@ void __init mmap_init(void)
> #ifdef CONFIG_SYSCTL
> register_sysctl_init("vm", mmap_table);
> #endif
> + vma_state_init();
> }
>
> /*
> diff --git a/mm/nommu.c b/mm/nommu.c
> index a142fc258d39..0bf4849b8204 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
> };
>
> /*
> - * initialise the percpu counter for VM and region record slabs
> + * initialise the percpu counter for VM and region record slabs, initialise VMA
> + * state.
> */
> void __init mmap_init(void)
> {
> @@ -409,6 +410,7 @@ void __init mmap_init(void)
> VM_BUG_ON(ret);
> vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
> register_sysctl_init("vm", nommu_table);
> + vma_state_init();
> }
>
> /*
> diff --git a/mm/vma.h b/mm/vma.h
> index 94307a2e4ab6..4a1e1768ca46 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -548,8 +548,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>
> int __vm_munmap(unsigned long start, size_t len, bool unlock);
>
> +
> int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
>
> +/* vma_init.h, shared between CONFIG_MMU and nommu. */
> +void __init vma_state_init(void);
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
> +void vm_area_free(struct vm_area_struct *vma);
> +
> /* vma_exec.h */
> #ifdef CONFIG_MMU
> int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> diff --git a/mm/vma_init.c b/mm/vma_init.c
> new file mode 100644
> index 000000000000..967ca8517986
> --- /dev/null
> +++ b/mm/vma_init.c
> @@ -0,0 +1,101 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/*
> + * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
> + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
> + */
> +
> +#include "vma_internal.h"
> +#include "vma.h"
> +
> +/* SLAB cache for vm_area_struct structures */
> +static struct kmem_cache *vm_area_cachep;
> +
> +void __init vma_state_init(void)
> +{
> + struct kmem_cache_args args = {
> + .use_freeptr_offset = true,
> + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> + };
> +
> + vm_area_cachep = kmem_cache_create("vm_area_struct",
> + sizeof(struct vm_area_struct), &args,
> + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> + SLAB_ACCOUNT);
> +}
> +
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> +{
> + struct vm_area_struct *vma;
> +
> + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> + if (!vma)
> + return NULL;
> +
> + vma_init(vma, mm);
> +
> + return vma;
> +}
> +
> +static void vm_area_init_from(const struct vm_area_struct *src,
> + struct vm_area_struct *dest)
> +{
> + dest->vm_mm = src->vm_mm;
> + dest->vm_ops = src->vm_ops;
> + dest->vm_start = src->vm_start;
> + dest->vm_end = src->vm_end;
> + dest->anon_vma = src->anon_vma;
> + dest->vm_pgoff = src->vm_pgoff;
> + dest->vm_file = src->vm_file;
> + dest->vm_private_data = src->vm_private_data;
> + vm_flags_init(dest, src->vm_flags);
> + memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> + sizeof(dest->vm_page_prot));
> + /*
> + * src->shared.rb may be modified concurrently when called from
> + * dup_mmap(), but the clone will reinitialize it.
> + */
> + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> + sizeof(dest->vm_userfaultfd_ctx));
> +#ifdef CONFIG_ANON_VMA_NAME
> + dest->anon_name = src->anon_name;
> +#endif
> +#ifdef CONFIG_SWAP
> + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> + sizeof(dest->swap_readahead_info));
> +#endif
> +#ifndef CONFIG_MMU
> + dest->vm_region = src->vm_region;
> +#endif
> +#ifdef CONFIG_NUMA
> + dest->vm_policy = src->vm_policy;
> +#endif
> +}
> +
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> +{
> + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> +
> + if (!new)
> + return NULL;
> +
> + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> + ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> + vm_area_init_from(orig, new);
> + vma_lock_init(new, true);
> + INIT_LIST_HEAD(&new->anon_vma_chain);
> + vma_numab_state_init(new);
> + dup_anon_vma_name(orig, new);
> +
> + return new;
> +}
> +
> +void vm_area_free(struct vm_area_struct *vma)
> +{
> + /* The vma should be detached while being destroyed. */
> + vma_assert_detached(vma);
> + vma_numab_state_free(vma);
> + free_anon_vma_name(vma);
> + kmem_cache_free(vm_area_cachep, vma);
> +}
> diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> index 624040fcf193..66f3831a668f 100644
> --- a/tools/testing/vma/Makefile
> +++ b/tools/testing/vma/Makefile
> @@ -9,7 +9,7 @@ include ../shared/shared.mk
> OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> TARGETS = vma
>
> -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
>
> vma: $(OFILES)
> $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index 5832ae5d797d..2be7597a2ac2 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> * Directly import the VMA implementation here. Our vma_internal.h wrapper
> * provides userland-equivalent functionality for everything vma.c uses.
> */
> +#include "../../../mm/vma_init.c"
> #include "../../../mm/vma_exec.c"
> #include "../../../mm/vma.c"
>
> @@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
> return res;
> }
>
> +static void detach_free_vma(struct vm_area_struct *vma)
> +{
> + vma_mark_detached(vma);
> + vm_area_free(vma);
> +}
> +
> /* Helper function to allocate a VMA and link it to the tree. */
> static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> unsigned long start,
> @@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> return NULL;
>
> if (attach_vma(mm, vma)) {
> - vm_area_free(vma);
> + detach_free_vma(vma);
> return NULL;
> }
>
> @@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
>
> vma_iter_set(vmi, 0);
> for_each_vma(*vmi, vma) {
> - vm_area_free(vma);
> + detach_free_vma(vma);
> count++;
> }
>
> @@ -319,7 +326,7 @@ static bool test_simple_merge(void)
> ASSERT_EQ(vma->vm_pgoff, 0);
> ASSERT_EQ(vma->vm_flags, flags);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -361,7 +368,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x1000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> vma_iter_clear(&vmi);
>
> vma = vma_next(&vmi);
> @@ -370,7 +377,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x2000);
> ASSERT_EQ(vma->vm_pgoff, 1);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> vma_iter_clear(&vmi);
>
> vma = vma_next(&vmi);
> @@ -379,7 +386,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x3000);
> ASSERT_EQ(vma->vm_pgoff, 2);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -407,7 +414,7 @@ static bool test_simple_expand(void)
> ASSERT_EQ(vma->vm_end, 0x3000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
> ASSERT_EQ(vma->vm_end, 0x1000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -619,7 +626,7 @@ static bool test_merge_new(void)
> ASSERT_EQ(vma->vm_pgoff, 0);
> ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> count++;
> }
>
> @@ -1668,6 +1675,7 @@ int main(void)
> int num_tests = 0, num_fail = 0;
>
> maple_tree_init();
> + vma_state_init();
>
> #define TEST(name) \
> do { \
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 32e990313158..198abe66de5a 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
> */
> #define pr_warn_once pr_err
>
> +#define data_race(expr) expr
> +
> +#define ASSERT_EXCLUSIVE_WRITER(x)
> +
> struct kref {
> refcount_t refcount;
> };
> @@ -255,6 +259,8 @@ struct file {
>
> #define VMA_LOCK_OFFSET 0x40000000
>
> +typedef struct { unsigned long v; } freeptr_t;
> +
> struct vm_area_struct {
> /* The first cache line has the info for VMA tree walking. */
>
> @@ -264,9 +270,7 @@ struct vm_area_struct {
> unsigned long vm_start;
> unsigned long vm_end;
> };
> -#ifdef CONFIG_PER_VMA_LOCK
> - struct rcu_head vm_rcu; /* Used for deferred freeing. */
> -#endif
> + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
> };
>
> struct mm_struct *vm_mm; /* The address space we belong to. */
> @@ -463,6 +467,65 @@ struct pagetable_move_control {
> .len_in = len_, \
> }
>
> +struct kmem_cache_args {
> + /**
> + * @align: The required alignment for the objects.
> + *
> + * %0 means no specific alignment is requested.
> + */
> + unsigned int align;
> + /**
> + * @useroffset: Usercopy region offset.
> + *
> + * %0 is a valid offset, when @usersize is non-%0
> + */
> + unsigned int useroffset;
> + /**
> + * @usersize: Usercopy region size.
> + *
> + * %0 means no usercopy region is specified.
> + */
> + unsigned int usersize;
> + /**
> + * @freeptr_offset: Custom offset for the free pointer
> + * in &SLAB_TYPESAFE_BY_RCU caches
> + *
> + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
> + * outside of the object. This might cause the object to grow in size.
> + * Cache creators that have a reason to avoid this can specify a custom
> + * free pointer offset in their struct where the free pointer will be
> + * placed.
> + *
> + * Note that placing the free pointer inside the object requires the
> + * caller to ensure that no fields are invalidated that are required to
> + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
> + * details).
> + *
> + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
> + * is specified, %use_freeptr_offset must be set %true.
> + *
> + * Note that @ctor currently isn't supported with custom free pointers
> + * as a @ctor requires an external free pointer.
> + */
> + unsigned int freeptr_offset;
> + /**
> + * @use_freeptr_offset: Whether a @freeptr_offset is used.
> + */
> + bool use_freeptr_offset;
> + /**
> + * @ctor: A constructor for the objects.
> + *
> + * The constructor is invoked for each object in a newly allocated slab
> + * page. It is the cache user's responsibility to free the object in the
> + * same state as after calling the constructor, or deal appropriately
> + * with any differences between a freshly constructed and a reallocated
> + * object.
> + *
> + * %NULL means no constructor.
> + */
> + void (*ctor)(void *);
> +};
> +
> static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> {
> mas_pause(&vmi->mas);
> @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> vma->vm_lock_seq = UINT_MAX;
> }
>
> -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
> +struct kmem_cache {
> + const char *name;
> + size_t object_size;
> + struct kmem_cache_args *args;
> +};
>
> - if (!vma)
> - return NULL;
> +static inline struct kmem_cache *__kmem_cache_create(const char *name,
> + size_t object_size,
> + struct kmem_cache_args *args)
> +{
> + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
>
> - vma_init(vma, mm);
> + ret->name = name;
> + ret->object_size = object_size;
> + ret->args = args;
>
> - return vma;
> + return ret;
> }
>
> -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
> +#define kmem_cache_create(__name, __object_size, __args, ...) \
> + __kmem_cache_create((__name), (__object_size), (__args))
>
> - if (!new)
> - return NULL;
> +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> +{
> + (void)gfpflags;
>
> - memcpy(new, orig, sizeof(*new));
> - refcount_set(&new->vm_refcnt, 0);
> - new->vm_lock_seq = UINT_MAX;
> - INIT_LIST_HEAD(&new->anon_vma_chain);
> + return calloc(s->object_size, 1);
> +}
>
> - return new;
> +static inline void kmem_cache_free(struct kmem_cache *s, void *x)
> +{
> + free(x);
> }
>
> /*
> @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
> {
> }
>
> -static inline void vm_area_free(struct vm_area_struct *vma)
> -{
> - free(vma);
> -}
> -
> static inline void lru_add_drain(void)
> {
> }
> @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
> (void)mm;
> }
>
> +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> +{
> + (void)vma;
> + (void)reset_refcnt;
> +}
> +
> +static inline void vma_numab_state_init(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> +static inline void vma_numab_state_free(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> + struct vm_area_struct *new_vma)
> +{
> + (void)orig_vma;
> + (void)new_vma;
> +}
> +
> +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.49.0
>
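A quick aside on the mechanism the patch leans on: SLAB_TYPESAFE_BY_RCU caches normally place the slab free pointer outside the object, which grows it, and the kmem_cache_args interface lets a cache creator overlay the free pointer on a field inside the object instead. That is exactly what vma_state_init() does with vm_freeptr, which shares a union with vm_start/vm_end. A minimal sketch of the same pattern, with "struct foo" and foo_cache_init() as purely illustrative names (assumes <linux/slab.h>):

#include <linux/slab.h>

/*
 * Illustrative only: any object works, provided the fields the free
 * pointer overlays are rewritten on every allocation and are never
 * used to guard against object recycling.
 */
struct foo {
	union {
		struct {
			unsigned long start;
			unsigned long end;
		};
		freeptr_t fp;	/* overlaid slab free pointer */
	};
};

static struct kmem_cache *foo_cachep;

static void __init foo_cache_init(void)
{
	struct kmem_cache_args args = {
		.use_freeptr_offset = true,
		.freeptr_offset = offsetof(struct foo, fp),
	};

	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), &args,
			SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT);
}

The constraint from the kmem_cache_args kernel-doc (reproduced in vma_internal.h above) still applies: whatever the free pointer overlays cannot be trusted across free/realloc cycles, which is the price paid for not growing the object.
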
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250428 11:28]:
> Right now these are performed in kernel/fork.c which is odd and a violation
> of separation of concerns, as well as preventing us from integrating this
> and related logic into userland VMA testing going forward, and perhaps more
> importantly - enabling us to, in a subsequent commit, make VMA
> allocation/freeing a purely internal mm operation.
>
> There is a fly in the ointment - nommu - mmap.c is not compiled if
> CONFIG_MMU not set, and neither is vma.c.
>
> To square the circle, let's add a new file - vma_init.c. This will be
> compiled for both CONFIG_MMU and nommu builds, and will also form part of
> the VMA userland testing.
>
> This allows us to de-duplicate code, while maintaining separation of
> concerns and the ability for us to userland test this logic.
>
> Update the VMA userland tests accordingly, additionally adding a
> detach_free_vma() helper function to correctly detach VMAs before freeing
> them in test code, as this change was triggering the assert for this.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
One small nit below.
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> ---
> MAINTAINERS | 1 +
> kernel/fork.c | 88 -------------------
> mm/Makefile | 2 +-
> mm/mmap.c | 3 +-
> mm/nommu.c | 4 +-
> mm/vma.h | 7 ++
> mm/vma_init.c | 101 ++++++++++++++++++++++
> tools/testing/vma/Makefile | 2 +-
> tools/testing/vma/vma.c | 26 ++++--
> tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------
> 10 files changed, 251 insertions(+), 126 deletions(-)
> create mode 100644 mm/vma_init.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1ee1c22e6e36..d274e6802ba5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -15656,6 +15656,7 @@ F: mm/mseal.c
> F: mm/vma.c
> F: mm/vma.h
> F: mm/vma_exec.c
> +F: mm/vma_init.c
> F: mm/vma_internal.h
> F: tools/testing/selftests/mm/merge.c
> F: tools/testing/vma/
> diff --git a/kernel/fork.c b/kernel/fork.c
> index ac9f9267a473..9e4616dacd82 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
> /* SLAB cache for fs_struct structures (tsk->fs) */
> struct kmem_cache *fs_cachep;
>
> -/* SLAB cache for vm_area_struct structures */
> -static struct kmem_cache *vm_area_cachep;
> -
> /* SLAB cache for mm_struct structures (tsk->mm) */
> static struct kmem_cache *mm_cachep;
>
> -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma;
> -
> - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> - if (!vma)
> - return NULL;
> -
> - vma_init(vma, mm);
> -
> - return vma;
> -}
> -
> -static void vm_area_init_from(const struct vm_area_struct *src,
> - struct vm_area_struct *dest)
> -{
> - dest->vm_mm = src->vm_mm;
> - dest->vm_ops = src->vm_ops;
> - dest->vm_start = src->vm_start;
> - dest->vm_end = src->vm_end;
> - dest->anon_vma = src->anon_vma;
> - dest->vm_pgoff = src->vm_pgoff;
> - dest->vm_file = src->vm_file;
> - dest->vm_private_data = src->vm_private_data;
> - vm_flags_init(dest, src->vm_flags);
> - memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> - sizeof(dest->vm_page_prot));
> - /*
> - * src->shared.rb may be modified concurrently when called from
> - * dup_mmap(), but the clone will reinitialize it.
> - */
> - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> - sizeof(dest->vm_userfaultfd_ctx));
> -#ifdef CONFIG_ANON_VMA_NAME
> - dest->anon_name = src->anon_name;
> -#endif
> -#ifdef CONFIG_SWAP
> - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> - sizeof(dest->swap_readahead_info));
> -#endif
> -#ifndef CONFIG_MMU
> - dest->vm_region = src->vm_region;
> -#endif
> -#ifdef CONFIG_NUMA
> - dest->vm_policy = src->vm_policy;
> -#endif
> -}
> -
> -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> -
> - if (!new)
> - return NULL;
> -
> - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> - ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> - vm_area_init_from(orig, new);
> - vma_lock_init(new, true);
> - INIT_LIST_HEAD(&new->anon_vma_chain);
> - vma_numab_state_init(new);
> - dup_anon_vma_name(orig, new);
> -
> - return new;
> -}
> -
> -void vm_area_free(struct vm_area_struct *vma)
> -{
> - /* The vma should be detached while being destroyed. */
> - vma_assert_detached(vma);
> - vma_numab_state_free(vma);
> - free_anon_vma_name(vma);
> - kmem_cache_free(vm_area_cachep, vma);
> -}
> -
> static void account_kernel_stack(struct task_struct *tsk, int account)
> {
> if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
>
> void __init proc_caches_init(void)
> {
> - struct kmem_cache_args args = {
> - .use_freeptr_offset = true,
> - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> - };
> -
> sighand_cachep = kmem_cache_create("sighand_cache",
> sizeof(struct sighand_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
> sizeof(struct fs_struct), 0,
> SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> NULL);
> - vm_area_cachep = kmem_cache_create("vm_area_struct",
> - sizeof(struct vm_area_struct), &args,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> - SLAB_ACCOUNT);
> mmap_init();
> nsproxy_cache_init();
> }
> diff --git a/mm/Makefile b/mm/Makefile
> index 15a901bb431a..690ddcf7d9a1 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
> mm_init.o percpu.o slab_common.o \
> compaction.o show_mem.o \
> interval_tree.o list_lru.o workingset.o \
> - debug.o gup.o mmap_lock.o $(mmu-y)
> + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
>
> # Give 'page_alloc' its own module-parameter namespace
> page-alloc-y := page_alloc.o
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 5259df031e15..81dd962a1cfc 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
> #endif /* CONFIG_SYSCTL */
>
> /*
> - * initialise the percpu counter for VM
> + * initialise the percpu counter for VM, initialise VMA state.
> */
> void __init mmap_init(void)
> {
> @@ -1565,6 +1565,7 @@ void __init mmap_init(void)
> #ifdef CONFIG_SYSCTL
> register_sysctl_init("vm", mmap_table);
> #endif
> + vma_state_init();
> }
>
> /*
> diff --git a/mm/nommu.c b/mm/nommu.c
> index a142fc258d39..0bf4849b8204 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
> };
>
> /*
> - * initialise the percpu counter for VM and region record slabs
> + * initialise the percpu counter for VM and region record slabs, initialise VMA
> + * state.
> */
> void __init mmap_init(void)
> {
> @@ -409,6 +410,7 @@ void __init mmap_init(void)
> VM_BUG_ON(ret);
> vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
> register_sysctl_init("vm", nommu_table);
> + vma_state_init();
> }
>
> /*
> diff --git a/mm/vma.h b/mm/vma.h
> index 94307a2e4ab6..4a1e1768ca46 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -548,8 +548,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>
> int __vm_munmap(unsigned long start, size_t len, bool unlock);
>
> +
Accidental extra line here?
> int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
>
> +/* vma_init.h, shared between CONFIG_MMU and nommu. */
> +void __init vma_state_init(void);
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
> +void vm_area_free(struct vm_area_struct *vma);
> +
> /* vma_exec.h */
> #ifdef CONFIG_MMU
> int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> diff --git a/mm/vma_init.c b/mm/vma_init.c
> new file mode 100644
> index 000000000000..967ca8517986
> --- /dev/null
> +++ b/mm/vma_init.c
> @@ -0,0 +1,101 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +
> +/*
> + * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
> + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
> + */
> +
> +#include "vma_internal.h"
> +#include "vma.h"
> +
> +/* SLAB cache for vm_area_struct structures */
> +static struct kmem_cache *vm_area_cachep;
> +
> +void __init vma_state_init(void)
> +{
> + struct kmem_cache_args args = {
> + .use_freeptr_offset = true,
> + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> + };
> +
> + vm_area_cachep = kmem_cache_create("vm_area_struct",
> + sizeof(struct vm_area_struct), &args,
> + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> + SLAB_ACCOUNT);
> +}
> +
> +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> +{
> + struct vm_area_struct *vma;
> +
> + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> + if (!vma)
> + return NULL;
> +
> + vma_init(vma, mm);
> +
> + return vma;
> +}
> +
> +static void vm_area_init_from(const struct vm_area_struct *src,
> + struct vm_area_struct *dest)
> +{
> + dest->vm_mm = src->vm_mm;
> + dest->vm_ops = src->vm_ops;
> + dest->vm_start = src->vm_start;
> + dest->vm_end = src->vm_end;
> + dest->anon_vma = src->anon_vma;
> + dest->vm_pgoff = src->vm_pgoff;
> + dest->vm_file = src->vm_file;
> + dest->vm_private_data = src->vm_private_data;
> + vm_flags_init(dest, src->vm_flags);
> + memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> + sizeof(dest->vm_page_prot));
> + /*
> + * src->shared.rb may be modified concurrently when called from
> + * dup_mmap(), but the clone will reinitialize it.
> + */
> + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> + sizeof(dest->vm_userfaultfd_ctx));
> +#ifdef CONFIG_ANON_VMA_NAME
> + dest->anon_name = src->anon_name;
> +#endif
> +#ifdef CONFIG_SWAP
> + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> + sizeof(dest->swap_readahead_info));
> +#endif
> +#ifndef CONFIG_MMU
> + dest->vm_region = src->vm_region;
> +#endif
> +#ifdef CONFIG_NUMA
> + dest->vm_policy = src->vm_policy;
> +#endif
> +}
> +
> +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> +{
> + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> +
> + if (!new)
> + return NULL;
> +
> + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> + ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> + vm_area_init_from(orig, new);
> + vma_lock_init(new, true);
> + INIT_LIST_HEAD(&new->anon_vma_chain);
> + vma_numab_state_init(new);
> + dup_anon_vma_name(orig, new);
> +
> + return new;
> +}
> +
> +void vm_area_free(struct vm_area_struct *vma)
> +{
> + /* The vma should be detached while being destroyed. */
> + vma_assert_detached(vma);
> + vma_numab_state_free(vma);
> + free_anon_vma_name(vma);
> + kmem_cache_free(vm_area_cachep, vma);
> +}
> diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> index 624040fcf193..66f3831a668f 100644
> --- a/tools/testing/vma/Makefile
> +++ b/tools/testing/vma/Makefile
> @@ -9,7 +9,7 @@ include ../shared/shared.mk
> OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> TARGETS = vma
>
> -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
>
> vma: $(OFILES)
> $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index 5832ae5d797d..2be7597a2ac2 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> * Directly import the VMA implementation here. Our vma_internal.h wrapper
> * provides userland-equivalent functionality for everything vma.c uses.
> */
> +#include "../../../mm/vma_init.c"
> #include "../../../mm/vma_exec.c"
> #include "../../../mm/vma.c"
>
> @@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
> return res;
> }
>
> +static void detach_free_vma(struct vm_area_struct *vma)
> +{
> + vma_mark_detached(vma);
> + vm_area_free(vma);
> +}
> +
> /* Helper function to allocate a VMA and link it to the tree. */
> static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> unsigned long start,
> @@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> return NULL;
>
> if (attach_vma(mm, vma)) {
> - vm_area_free(vma);
> + detach_free_vma(vma);
> return NULL;
> }
>
> @@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
>
> vma_iter_set(vmi, 0);
> for_each_vma(*vmi, vma) {
> - vm_area_free(vma);
> + detach_free_vma(vma);
> count++;
> }
>
> @@ -319,7 +326,7 @@ static bool test_simple_merge(void)
> ASSERT_EQ(vma->vm_pgoff, 0);
> ASSERT_EQ(vma->vm_flags, flags);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -361,7 +368,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x1000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> vma_iter_clear(&vmi);
>
> vma = vma_next(&vmi);
> @@ -370,7 +377,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x2000);
> ASSERT_EQ(vma->vm_pgoff, 1);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> vma_iter_clear(&vmi);
>
> vma = vma_next(&vmi);
> @@ -379,7 +386,7 @@ static bool test_simple_modify(void)
> ASSERT_EQ(vma->vm_end, 0x3000);
> ASSERT_EQ(vma->vm_pgoff, 2);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -407,7 +414,7 @@ static bool test_simple_expand(void)
> ASSERT_EQ(vma->vm_end, 0x3000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
> ASSERT_EQ(vma->vm_end, 0x1000);
> ASSERT_EQ(vma->vm_pgoff, 0);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> mtree_destroy(&mm.mm_mt);
>
> return true;
> @@ -619,7 +626,7 @@ static bool test_merge_new(void)
> ASSERT_EQ(vma->vm_pgoff, 0);
> ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
>
> - vm_area_free(vma);
> + detach_free_vma(vma);
> count++;
> }
>
> @@ -1668,6 +1675,7 @@ int main(void)
> int num_tests = 0, num_fail = 0;
>
> maple_tree_init();
> + vma_state_init();
>
> #define TEST(name) \
> do { \
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 32e990313158..198abe66de5a 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
> */
> #define pr_warn_once pr_err
>
> +#define data_race(expr) expr
> +
> +#define ASSERT_EXCLUSIVE_WRITER(x)
> +
> struct kref {
> refcount_t refcount;
> };
> @@ -255,6 +259,8 @@ struct file {
>
> #define VMA_LOCK_OFFSET 0x40000000
>
> +typedef struct { unsigned long v; } freeptr_t;
> +
> struct vm_area_struct {
> /* The first cache line has the info for VMA tree walking. */
>
> @@ -264,9 +270,7 @@ struct vm_area_struct {
> unsigned long vm_start;
> unsigned long vm_end;
> };
> -#ifdef CONFIG_PER_VMA_LOCK
> - struct rcu_head vm_rcu; /* Used for deferred freeing. */
> -#endif
> + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
> };
>
> struct mm_struct *vm_mm; /* The address space we belong to. */
> @@ -463,6 +467,65 @@ struct pagetable_move_control {
> .len_in = len_, \
> }
>
> +struct kmem_cache_args {
> + /**
> + * @align: The required alignment for the objects.
> + *
> + * %0 means no specific alignment is requested.
> + */
> + unsigned int align;
> + /**
> + * @useroffset: Usercopy region offset.
> + *
> + * %0 is a valid offset, when @usersize is non-%0
> + */
> + unsigned int useroffset;
> + /**
> + * @usersize: Usercopy region size.
> + *
> + * %0 means no usercopy region is specified.
> + */
> + unsigned int usersize;
> + /**
> + * @freeptr_offset: Custom offset for the free pointer
> + * in &SLAB_TYPESAFE_BY_RCU caches
> + *
> + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
> + * outside of the object. This might cause the object to grow in size.
> + * Cache creators that have a reason to avoid this can specify a custom
> + * free pointer offset in their struct where the free pointer will be
> + * placed.
> + *
> + * Note that placing the free pointer inside the object requires the
> + * caller to ensure that no fields are invalidated that are required to
> + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
> + * details).
> + *
> + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
> + * is specified, %use_freeptr_offset must be set %true.
> + *
> + * Note that @ctor currently isn't supported with custom free pointers
> + * as a @ctor requires an external free pointer.
> + */
> + unsigned int freeptr_offset;
> + /**
> + * @use_freeptr_offset: Whether a @freeptr_offset is used.
> + */
> + bool use_freeptr_offset;
> + /**
> + * @ctor: A constructor for the objects.
> + *
> + * The constructor is invoked for each object in a newly allocated slab
> + * page. It is the cache user's responsibility to free the object in the
> + * same state as after calling the constructor, or deal appropriately
> + * with any differences between a freshly constructed and a reallocated
> + * object.
> + *
> + * %NULL means no constructor.
> + */
> + void (*ctor)(void *);
> +};
> +
> static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> {
> mas_pause(&vmi->mas);
> @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> vma->vm_lock_seq = UINT_MAX;
> }
>
> -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> -{
> - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
> +struct kmem_cache {
> + const char *name;
> + size_t object_size;
> + struct kmem_cache_args *args;
> +};
>
> - if (!vma)
> - return NULL;
> +static inline struct kmem_cache *__kmem_cache_create(const char *name,
> + size_t object_size,
> + struct kmem_cache_args *args)
> +{
> + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
>
> - vma_init(vma, mm);
> + ret->name = name;
> + ret->object_size = object_size;
> + ret->args = args;
>
> - return vma;
> + return ret;
> }
>
> -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> -{
> - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
> +#define kmem_cache_create(__name, __object_size, __args, ...) \
> + __kmem_cache_create((__name), (__object_size), (__args))
>
> - if (!new)
> - return NULL;
> +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> +{
> + (void)gfpflags;
>
> - memcpy(new, orig, sizeof(*new));
> - refcount_set(&new->vm_refcnt, 0);
> - new->vm_lock_seq = UINT_MAX;
> - INIT_LIST_HEAD(&new->anon_vma_chain);
> + return calloc(s->object_size, 1);
> +}
>
> - return new;
> +static inline void kmem_cache_free(struct kmem_cache *s, void *x)
> +{
> + free(x);
> }
>
> /*
> @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
> {
> }
>
> -static inline void vm_area_free(struct vm_area_struct *vma)
> -{
> - free(vma);
> -}
> -
> static inline void lru_add_drain(void)
> {
> }
> @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
> (void)mm;
> }
>
> +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> +{
> + (void)vma;
> + (void)reset_refcnt;
> +}
> +
> +static inline void vma_numab_state_init(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> +static inline void vma_numab_state_free(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> + struct vm_area_struct *new_vma)
> +{
> + (void)orig_vma;
> + (void)new_vma;
> +}
> +
> +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> +{
> + (void)vma;
> +}
> +
> #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.49.0
>
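One detail of the vma_internal.h shim quoted above worth calling out: the userland kmem_cache_create() is a variadic macro, so the kernel call sites compile unchanged and the SLAB_* flags simply vanish into the ignored "..." parameter. Hand-expanding the vma_state_init() call (shown here for illustration only) gives roughly:

/* The SLAB_HWCACHE_ALIGN|... flags were matched by the macro's "..." */
vm_area_cachep = __kmem_cache_create("vm_area_struct",
		sizeof(struct vm_area_struct), &args);

which is what lets the very same mm/vma_init.c build against either the real slab allocator or the calloc()/free() emulation.
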
On Mon, Apr 28, 2025 at 03:14:46PM -0400, Liam R. Howlett wrote:
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250428 11:28]:
> > Right now these are performed in kernel/fork.c which is odd and a violation
> > of separation of concerns, as well as preventing us from integrating this
> > and related logic into userland VMA testing going forward, and perhaps more
> > importantly - enabling us to, in a subsequent commit, make VMA
> > allocation/freeing a purely internal mm operation.
> >
> > There is a fly in the ointment - nommu - mmap.c is not compiled if
> > CONFIG_MMU not set, and neither is vma.c.
> >
> > To square the circle, let's add a new file - vma_init.c. This will be
> > compiled for both CONFIG_MMU and nommu builds, and will also form part of
> > the VMA userland testing.
> >
> > This allows us to de-duplicate code, while maintaining separation of
> > concerns and the ability for us to userland test this logic.
> >
> > Update the VMA userland tests accordingly, additionally adding a
> > detach_free_vma() helper function to correctly detach VMAs before freeing
> > them in test code, as this change was triggering the assert for this.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> One small nit below.
>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Thanks!
I see Andrew already fixed up the nit :)
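
For anyone wanting to poke at this, the userland harness now follows the
same object lifecycle as the kernel side. A condensed sketch of what
tools/testing/vma/vma.c effectively does (not a verbatim excerpt; the
real test bodies and error paths are omitted):

int main(void)
{
	struct mm_struct mm = {};
	struct vm_area_struct *vma;

	maple_tree_init();
	vma_state_init();		/* sets up the emulated slab cache */

	vma = vm_area_alloc(&mm);	/* kmem_cache_alloc() -> calloc() */
	if (!vma)
		return 1;

	/* ... exercise merge/split/expand against the maple tree ... */

	/* vm_area_free() now asserts detachment, hence detach_free_vma() */
	vma_mark_detached(vma);
	vm_area_free(vma);		/* kmem_cache_free() -> free() */

	return 0;
}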
>
> > ---
> > MAINTAINERS | 1 +
> > kernel/fork.c | 88 -------------------
> > mm/Makefile | 2 +-
> > mm/mmap.c | 3 +-
> > mm/nommu.c | 4 +-
> > mm/vma.h | 7 ++
> > mm/vma_init.c | 101 ++++++++++++++++++++++
> > tools/testing/vma/Makefile | 2 +-
> > tools/testing/vma/vma.c | 26 ++++--
> > tools/testing/vma/vma_internal.h | 143 +++++++++++++++++++++++++------
> > 10 files changed, 251 insertions(+), 126 deletions(-)
> > create mode 100644 mm/vma_init.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index 1ee1c22e6e36..d274e6802ba5 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -15656,6 +15656,7 @@ F: mm/mseal.c
> > F: mm/vma.c
> > F: mm/vma.h
> > F: mm/vma_exec.c
> > +F: mm/vma_init.c
> > F: mm/vma_internal.h
> > F: tools/testing/selftests/mm/merge.c
> > F: tools/testing/vma/
> > diff --git a/kernel/fork.c b/kernel/fork.c
> > index ac9f9267a473..9e4616dacd82 100644
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
> > /* SLAB cache for fs_struct structures (tsk->fs) */
> > struct kmem_cache *fs_cachep;
> >
> > -/* SLAB cache for vm_area_struct structures */
> > -static struct kmem_cache *vm_area_cachep;
> > -
> > /* SLAB cache for mm_struct structures (tsk->mm) */
> > static struct kmem_cache *mm_cachep;
> >
> > -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > -{
> > - struct vm_area_struct *vma;
> > -
> > - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> > - if (!vma)
> > - return NULL;
> > -
> > - vma_init(vma, mm);
> > -
> > - return vma;
> > -}
> > -
> > -static void vm_area_init_from(const struct vm_area_struct *src,
> > - struct vm_area_struct *dest)
> > -{
> > - dest->vm_mm = src->vm_mm;
> > - dest->vm_ops = src->vm_ops;
> > - dest->vm_start = src->vm_start;
> > - dest->vm_end = src->vm_end;
> > - dest->anon_vma = src->anon_vma;
> > - dest->vm_pgoff = src->vm_pgoff;
> > - dest->vm_file = src->vm_file;
> > - dest->vm_private_data = src->vm_private_data;
> > - vm_flags_init(dest, src->vm_flags);
> > - memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > - sizeof(dest->vm_page_prot));
> > - /*
> > - * src->shared.rb may be modified concurrently when called from
> > - * dup_mmap(), but the clone will reinitialize it.
> > - */
> > - data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > - memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > - sizeof(dest->vm_userfaultfd_ctx));
> > -#ifdef CONFIG_ANON_VMA_NAME
> > - dest->anon_name = src->anon_name;
> > -#endif
> > -#ifdef CONFIG_SWAP
> > - memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > - sizeof(dest->swap_readahead_info));
> > -#endif
> > -#ifndef CONFIG_MMU
> > - dest->vm_region = src->vm_region;
> > -#endif
> > -#ifdef CONFIG_NUMA
> > - dest->vm_policy = src->vm_policy;
> > -#endif
> > -}
> > -
> > -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > -{
> > - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> > -
> > - if (!new)
> > - return NULL;
> > -
> > - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> > - ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> > - vm_area_init_from(orig, new);
> > - vma_lock_init(new, true);
> > - INIT_LIST_HEAD(&new->anon_vma_chain);
> > - vma_numab_state_init(new);
> > - dup_anon_vma_name(orig, new);
> > -
> > - return new;
> > -}
> > -
> > -void vm_area_free(struct vm_area_struct *vma)
> > -{
> > - /* The vma should be detached while being destroyed. */
> > - vma_assert_detached(vma);
> > - vma_numab_state_free(vma);
> > - free_anon_vma_name(vma);
> > - kmem_cache_free(vm_area_cachep, vma);
> > -}
> > -
> > static void account_kernel_stack(struct task_struct *tsk, int account)
> > {
> > if (IS_ENABLED(CONFIG_VMAP_STACK)) {
> > @@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
> >
> > void __init proc_caches_init(void)
> > {
> > - struct kmem_cache_args args = {
> > - .use_freeptr_offset = true,
> > - .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> > - };
> > -
> > sighand_cachep = kmem_cache_create("sighand_cache",
> > sizeof(struct sighand_struct), 0,
> > SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> > @@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
> > sizeof(struct fs_struct), 0,
> > SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> > NULL);
> > - vm_area_cachep = kmem_cache_create("vm_area_struct",
> > - sizeof(struct vm_area_struct), &args,
> > - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> > - SLAB_ACCOUNT);
> > mmap_init();
> > nsproxy_cache_init();
> > }
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 15a901bb431a..690ddcf7d9a1 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
> > mm_init.o percpu.o slab_common.o \
> > compaction.o show_mem.o \
> > interval_tree.o list_lru.o workingset.o \
> > - debug.o gup.o mmap_lock.o $(mmu-y)
> > + debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
> >
> > # Give 'page_alloc' its own module-parameter namespace
> > page-alloc-y := page_alloc.o
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 5259df031e15..81dd962a1cfc 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
> > #endif /* CONFIG_SYSCTL */
> >
> > /*
> > - * initialise the percpu counter for VM
> > + * initialise the percpu counter for VM, initialise VMA state.
> > */
> > void __init mmap_init(void)
> > {
> > @@ -1565,6 +1565,7 @@ void __init mmap_init(void)
> > #ifdef CONFIG_SYSCTL
> > register_sysctl_init("vm", mmap_table);
> > #endif
> > + vma_state_init();
> > }
> >
> > /*
> > diff --git a/mm/nommu.c b/mm/nommu.c
> > index a142fc258d39..0bf4849b8204 100644
> > --- a/mm/nommu.c
> > +++ b/mm/nommu.c
> > @@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
> > };
> >
> > /*
> > - * initialise the percpu counter for VM and region record slabs
> > + * initialise the percpu counter for VM and region record slabs, initialise VMA
> > + * state.
> > */
> > void __init mmap_init(void)
> > {
> > @@ -409,6 +410,7 @@ void __init mmap_init(void)
> > VM_BUG_ON(ret);
> > vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
> > register_sysctl_init("vm", nommu_table);
> > + vma_state_init();
> > }
> >
> > /*
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 94307a2e4ab6..4a1e1768ca46 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -548,8 +548,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
> >
> > int __vm_munmap(unsigned long start, size_t len, bool unlock);
> >
> > +
>
> Accidental extra line here?
>
> > int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
> >
> > +/* vma_init.h, shared between CONFIG_MMU and nommu. */
> > +void __init vma_state_init(void);
> > +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
> > +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
> > +void vm_area_free(struct vm_area_struct *vma);
> > +
> > /* vma_exec.h */
> > #ifdef CONFIG_MMU
> > int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
> > diff --git a/mm/vma_init.c b/mm/vma_init.c
> > new file mode 100644
> > index 000000000000..967ca8517986
> > --- /dev/null
> > +++ b/mm/vma_init.c
> > @@ -0,0 +1,101 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +
> > +/*
> > + * Functions for initialising, allocating, freeing and duplicating VMAs. Shared
> > + * between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
> > + */
> > +
> > +#include "vma_internal.h"
> > +#include "vma.h"
> > +
> > +/* SLAB cache for vm_area_struct structures */
> > +static struct kmem_cache *vm_area_cachep;
> > +
> > +void __init vma_state_init(void)
> > +{
> > + struct kmem_cache_args args = {
> > + .use_freeptr_offset = true,
> > + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> > + };
> > +
> > + vm_area_cachep = kmem_cache_create("vm_area_struct",
> > + sizeof(struct vm_area_struct), &args,
> > + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> > + SLAB_ACCOUNT);
> > +}
> > +
> > +struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > +{
> > + struct vm_area_struct *vma;
> > +
> > + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> > + if (!vma)
> > + return NULL;
> > +
> > + vma_init(vma, mm);
> > +
> > + return vma;
> > +}
> > +
> > +static void vm_area_init_from(const struct vm_area_struct *src,
> > + struct vm_area_struct *dest)
> > +{
> > + dest->vm_mm = src->vm_mm;
> > + dest->vm_ops = src->vm_ops;
> > + dest->vm_start = src->vm_start;
> > + dest->vm_end = src->vm_end;
> > + dest->anon_vma = src->anon_vma;
> > + dest->vm_pgoff = src->vm_pgoff;
> > + dest->vm_file = src->vm_file;
> > + dest->vm_private_data = src->vm_private_data;
> > + vm_flags_init(dest, src->vm_flags);
> > + memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > + sizeof(dest->vm_page_prot));
> > + /*
> > + * src->shared.rb may be modified concurrently when called from
> > + * dup_mmap(), but the clone will reinitialize it.
> > + */
> > + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > + sizeof(dest->vm_userfaultfd_ctx));
> > +#ifdef CONFIG_ANON_VMA_NAME
> > + dest->anon_name = src->anon_name;
> > +#endif
> > +#ifdef CONFIG_SWAP
> > + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > + sizeof(dest->swap_readahead_info));
> > +#endif
> > +#ifndef CONFIG_MMU
> > + dest->vm_region = src->vm_region;
> > +#endif
> > +#ifdef CONFIG_NUMA
> > + dest->vm_policy = src->vm_policy;
> > +#endif
> > +}
> > +
> > +struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > +{
> > + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
> > +
> > + if (!new)
> > + return NULL;
> > +
> > + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
> > + ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
> > + vm_area_init_from(orig, new);
> > + vma_lock_init(new, true);
> > + INIT_LIST_HEAD(&new->anon_vma_chain);
> > + vma_numab_state_init(new);
> > + dup_anon_vma_name(orig, new);
> > +
> > + return new;
> > +}
> > +
> > +void vm_area_free(struct vm_area_struct *vma)
> > +{
> > + /* The vma should be detached while being destroyed. */
> > + vma_assert_detached(vma);
> > + vma_numab_state_free(vma);
> > + free_anon_vma_name(vma);
> > + kmem_cache_free(vm_area_cachep, vma);
> > +}
> > diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> > index 624040fcf193..66f3831a668f 100644
> > --- a/tools/testing/vma/Makefile
> > +++ b/tools/testing/vma/Makefile
> > @@ -9,7 +9,7 @@ include ../shared/shared.mk
> > OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> > TARGETS = vma
> >
> > -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> > +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
> >
> > vma: $(OFILES)
> > $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> > index 5832ae5d797d..2be7597a2ac2 100644
> > --- a/tools/testing/vma/vma.c
> > +++ b/tools/testing/vma/vma.c
> > @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> > * Directly import the VMA implementation here. Our vma_internal.h wrapper
> > * provides userland-equivalent functionality for everything vma.c uses.
> > */
> > +#include "../../../mm/vma_init.c"
> > #include "../../../mm/vma_exec.c"
> > #include "../../../mm/vma.c"
> >
> > @@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
> > return res;
> > }
> >
> > +static void detach_free_vma(struct vm_area_struct *vma)
> > +{
> > + vma_mark_detached(vma);
> > + vm_area_free(vma);
> > +}
> > +
> > /* Helper function to allocate a VMA and link it to the tree. */
> > static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> > unsigned long start,
> > @@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
> > return NULL;
> >
> > if (attach_vma(mm, vma)) {
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > return NULL;
> > }
> >
> > @@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
> >
> > vma_iter_set(vmi, 0);
> > for_each_vma(*vmi, vma) {
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > count++;
> > }
> >
> > @@ -319,7 +326,7 @@ static bool test_simple_merge(void)
> > ASSERT_EQ(vma->vm_pgoff, 0);
> > ASSERT_EQ(vma->vm_flags, flags);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > mtree_destroy(&mm.mm_mt);
> >
> > return true;
> > @@ -361,7 +368,7 @@ static bool test_simple_modify(void)
> > ASSERT_EQ(vma->vm_end, 0x1000);
> > ASSERT_EQ(vma->vm_pgoff, 0);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > vma_iter_clear(&vmi);
> >
> > vma = vma_next(&vmi);
> > @@ -370,7 +377,7 @@ static bool test_simple_modify(void)
> > ASSERT_EQ(vma->vm_end, 0x2000);
> > ASSERT_EQ(vma->vm_pgoff, 1);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > vma_iter_clear(&vmi);
> >
> > vma = vma_next(&vmi);
> > @@ -379,7 +386,7 @@ static bool test_simple_modify(void)
> > ASSERT_EQ(vma->vm_end, 0x3000);
> > ASSERT_EQ(vma->vm_pgoff, 2);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > mtree_destroy(&mm.mm_mt);
> >
> > return true;
> > @@ -407,7 +414,7 @@ static bool test_simple_expand(void)
> > ASSERT_EQ(vma->vm_end, 0x3000);
> > ASSERT_EQ(vma->vm_pgoff, 0);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > mtree_destroy(&mm.mm_mt);
> >
> > return true;
> > @@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
> > ASSERT_EQ(vma->vm_end, 0x1000);
> > ASSERT_EQ(vma->vm_pgoff, 0);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > mtree_destroy(&mm.mm_mt);
> >
> > return true;
> > @@ -619,7 +626,7 @@ static bool test_merge_new(void)
> > ASSERT_EQ(vma->vm_pgoff, 0);
> > ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
> >
> > - vm_area_free(vma);
> > + detach_free_vma(vma);
> > count++;
> > }
> >
> > @@ -1668,6 +1675,7 @@ int main(void)
> > int num_tests = 0, num_fail = 0;
> >
> > maple_tree_init();
> > + vma_state_init();
> >
> > #define TEST(name) \
> > do { \
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 32e990313158..198abe66de5a 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
> > */
> > #define pr_warn_once pr_err
> >
> > +#define data_race(expr) expr
> > +
> > +#define ASSERT_EXCLUSIVE_WRITER(x)
> > +
> > struct kref {
> > refcount_t refcount;
> > };
> > @@ -255,6 +259,8 @@ struct file {
> >
> > #define VMA_LOCK_OFFSET 0x40000000
> >
> > +typedef struct { unsigned long v; } freeptr_t;
> > +
> > struct vm_area_struct {
> > /* The first cache line has the info for VMA tree walking. */
> >
> > @@ -264,9 +270,7 @@ struct vm_area_struct {
> > unsigned long vm_start;
> > unsigned long vm_end;
> > };
> > -#ifdef CONFIG_PER_VMA_LOCK
> > - struct rcu_head vm_rcu; /* Used for deferred freeing. */
> > -#endif
> > + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
> > };
> >
> > struct mm_struct *vm_mm; /* The address space we belong to. */
> > @@ -463,6 +467,65 @@ struct pagetable_move_control {
> > .len_in = len_, \
> > }
> >
> > +struct kmem_cache_args {
> > + /**
> > + * @align: The required alignment for the objects.
> > + *
> > + * %0 means no specific alignment is requested.
> > + */
> > + unsigned int align;
> > + /**
> > + * @useroffset: Usercopy region offset.
> > + *
> > + * %0 is a valid offset, when @usersize is non-%0
> > + */
> > + unsigned int useroffset;
> > + /**
> > + * @usersize: Usercopy region size.
> > + *
> > + * %0 means no usercopy region is specified.
> > + */
> > + unsigned int usersize;
> > + /**
> > + * @freeptr_offset: Custom offset for the free pointer
> > + * in &SLAB_TYPESAFE_BY_RCU caches
> > + *
> > + * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
> > + * outside of the object. This might cause the object to grow in size.
> > + * Cache creators that have a reason to avoid this can specify a custom
> > + * free pointer offset in their struct where the free pointer will be
> > + * placed.
> > + *
> > + * Note that placing the free pointer inside the object requires the
> > + * caller to ensure that no fields are invalidated that are required to
> > + * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
> > + * details).
> > + *
> > + * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
> > + * is specified, %use_freeptr_offset must be set %true.
> > + *
> > + * Note that @ctor currently isn't supported with custom free pointers
> > + * as a @ctor requires an external free pointer.
> > + */
> > + unsigned int freeptr_offset;
> > + /**
> > + * @use_freeptr_offset: Whether a @freeptr_offset is used.
> > + */
> > + bool use_freeptr_offset;
> > + /**
> > + * @ctor: A constructor for the objects.
> > + *
> > + * The constructor is invoked for each object in a newly allocated slab
> > + * page. It is the cache user's responsibility to free the object in the
> > + * same state as after calling the constructor, or deal appropriately
> > + * with any differences between a freshly constructed and a reallocated
> > + * object.
> > + *
> > + * %NULL means no constructor.
> > + */
> > + void (*ctor)(void *);
> > +};
> > +
> > static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> > {
> > mas_pause(&vmi->mas);
> > @@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
> > vma->vm_lock_seq = UINT_MAX;
> > }
> >
> > -static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
> > -{
> > - struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
> > +struct kmem_cache {
> > + const char *name;
> > + size_t object_size;
> > + struct kmem_cache_args *args;
> > +};
> >
> > - if (!vma)
> > - return NULL;
> > +static inline struct kmem_cache *__kmem_cache_create(const char *name,
> > + size_t object_size,
> > + struct kmem_cache_args *args)
> > +{
> > + struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
> >
> > - vma_init(vma, mm);
> > + ret->name = name;
> > + ret->object_size = object_size;
> > + ret->args = args;
> >
> > - return vma;
> > + return ret;
> > }
> >
> > -static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
> > -{
> > - struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
> > +#define kmem_cache_create(__name, __object_size, __args, ...) \
> > + __kmem_cache_create((__name), (__object_size), (__args))
> >
> > - if (!new)
> > - return NULL;
> > +static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
> > +{
> > + (void)gfpflags;
> >
> > - memcpy(new, orig, sizeof(*new));
> > - refcount_set(&new->vm_refcnt, 0);
> > - new->vm_lock_seq = UINT_MAX;
> > - INIT_LIST_HEAD(&new->anon_vma_chain);
> > + return calloc(s->object_size, 1);
> > +}
> >
> > - return new;
> > +static inline void kmem_cache_free(struct kmem_cache *s, void *x)
> > +{
> > + free(x);
> > }
> >
> > /*
> > @@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
> > {
> > }
> >
> > -static inline void vm_area_free(struct vm_area_struct *vma)
> > -{
> > - free(vma);
> > -}
> > -
> > static inline void lru_add_drain(void)
> > {
> > }
> > @@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
> > (void)mm;
> > }
> >
> > +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
> > +{
> > + (void)vma;
> > + (void)reset_refcnt;
> > +}
> > +
> > +static inline void vma_numab_state_init(struct vm_area_struct *vma)
> > +{
> > + (void)vma;
> > +}
> > +
> > +static inline void vma_numab_state_free(struct vm_area_struct *vma)
> > +{
> > + (void)vma;
> > +}
> > +
> > +static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
> > + struct vm_area_struct *new_vma)
> > +{
> > + (void)orig_vma;
> > + (void)new_vma;
> > +}
> > +
> > +static inline void free_anon_vma_name(struct vm_area_struct *vma)
> > +{
> > + (void)vma;
> > +}
> > +
> > #endif /* __MM_VMA_INTERNAL_H */
> > --
> > 2.49.0
> >