Right now these are performed in kernel/fork.c which is odd and a violation
of separation of concerns, as well as preventing us from integrating this
and related logic into userland VMA testing going forward, and perhaps more
importantly - enabling us to, in a subsequent commit, make VMA
allocation/freeing a purely internal mm operation.
There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU is not set, and there is no sensible place to put these outside
of that, so we are put in the position of having to duplication some logic
here.
This isn't ideal, but since nommu is a niche use-case, already duplicates a
great deal of mmu logic by its nature and we can eliminate code that is not
applicable to nommu, it seems a worthwhile trade-off.
The intent is to move all this logic to vma.c in a subsequent commit,
rendering VMA allocation, freeing and duplication mm-internal-only and
userland testable.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
kernel/fork.c | 88 ---------------------------------------------------
mm/mmap.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++
mm/nommu.c | 67 +++++++++++++++++++++++++++++++++++++++
3 files changed, 151 insertions(+), 88 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 168681fc4b25..ebddc51624ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -428,88 +428,9 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!vma)
- return NULL;
-
- vma_init(vma, mm);
-
- return vma;
-}
-
-static void vm_area_init_from(const struct vm_area_struct *src,
- struct vm_area_struct *dest)
-{
- dest->vm_mm = src->vm_mm;
- dest->vm_ops = src->vm_ops;
- dest->vm_start = src->vm_start;
- dest->vm_end = src->vm_end;
- dest->anon_vma = src->anon_vma;
- dest->vm_pgoff = src->vm_pgoff;
- dest->vm_file = src->vm_file;
- dest->vm_private_data = src->vm_private_data;
- vm_flags_init(dest, src->vm_flags);
- memcpy(&dest->vm_page_prot, &src->vm_page_prot,
- sizeof(dest->vm_page_prot));
- /*
- * src->shared.rb may be modified concurrently when called from
- * dup_mmap(), but the clone will reinitialize it.
- */
- data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
- memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
- sizeof(dest->vm_userfaultfd_ctx));
-#ifdef CONFIG_ANON_VMA_NAME
- dest->anon_name = src->anon_name;
-#endif
-#ifdef CONFIG_SWAP
- memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
- sizeof(dest->swap_readahead_info));
-#endif
-#ifndef CONFIG_MMU
- dest->vm_region = src->vm_region;
-#endif
-#ifdef CONFIG_NUMA
- dest->vm_policy = src->vm_policy;
-#endif
-}
-
-struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-
- if (!new)
- return NULL;
-
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- vm_area_init_from(orig, new);
- vma_lock_init(new, true);
- INIT_LIST_HEAD(&new->anon_vma_chain);
- vma_numab_state_init(new);
- dup_anon_vma_name(orig, new);
-
- return new;
-}
-
-void vm_area_free(struct vm_area_struct *vma)
-{
- /* The vma should be detached while being destroyed. */
- vma_assert_detached(vma);
- vma_numab_state_free(vma);
- free_anon_vma_name(vma);
- kmem_cache_free(vm_area_cachep, vma);
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -3214,11 +3135,6 @@ void __init mm_cache_init(void)
void __init proc_caches_init(void)
{
- struct kmem_cache_args args = {
- .use_freeptr_offset = true,
- .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
- };
-
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3235,10 +3151,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- vm_area_cachep = kmem_cache_create("vm_area_struct",
- sizeof(struct vm_area_struct), &args,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
- SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 1289c6381419..fbddcd082a93 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -77,6 +77,82 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
+ sizeof(dest->vm_userfaultfd_ctx));
+#ifdef CONFIG_ANON_VMA_NAME
+ dest->anon_name = src->anon_name;
+#endif
+#ifdef CONFIG_SWAP
+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
+ sizeof(dest->swap_readahead_info));
+#endif
+#ifdef CONFIG_NUMA
+ dest->vm_policy = src->vm_policy;
+#endif
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+ vma_lock_init(new, true);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
+ return new;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ /* The vma should be detached while being destroyed. */
+ vma_assert_detached(vma);
+ vma_numab_state_free(vma);
+ free_anon_vma_name(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
@@ -1601,9 +1677,17 @@ static const struct ctl_table mmap_table[] = {
void __init mmap_init(void)
{
int ret;
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", mmap_table);
#endif
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b4d304c6445..2b3722266222 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,6 +53,65 @@ static struct kmem_cache *vm_region_jar;
struct rb_root nommu_region_tree = RB_ROOT;
DECLARE_RWSEM(nommu_region_sem);
+/* SLAB cache for vm_area_struct structures */
+static struct kmem_cache *vm_area_cachep;
+
+vm_area_struct *vm_area_alloc(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+
+ return vma;
+}
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void vm_area_init_from(const struct vm_area_struct *src,
+ struct vm_area_struct *dest)
+{
+ dest->vm_mm = src->vm_mm;
+ dest->vm_ops = src->vm_ops;
+ dest->vm_start = src->vm_start;
+ dest->vm_end = src->vm_end;
+ dest->anon_vma = src->anon_vma;
+ dest->vm_pgoff = src->vm_pgoff;
+ dest->vm_file = src->vm_file;
+ dest->vm_private_data = src->vm_private_data;
+ vm_flags_init(dest, src->vm_flags);
+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
+ sizeof(dest->vm_page_prot));
+ /*
+ * src->shared.rb may be modified concurrently when called from
+ * dup_mmap(), but the clone will reinitialize it.
+ */
+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
+
+ dest->vm_region = src->vm_region;
+}
+
+struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
+{
+ struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ vm_area_init_from(orig, new);
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+
+ return new;
+}
+
const struct vm_operations_struct generic_file_vm_ops = {
};
@@ -404,10 +463,18 @@ static const struct ctl_table nommu_table[] = {
void __init mmap_init(void)
{
int ret;
+ struct kmem_cache_args args = {
+ .use_freeptr_offset = true,
+ .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+ };
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
+ vm_area_cachep = kmem_cache_create("vm_area_struct",
+ sizeof(struct vm_area_struct), &args,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
+ SLAB_ACCOUNT);
register_sysctl_init("vm", nommu_table);
}
--
2.49.0
On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>+static void vm_area_init_from(const struct vm_area_struct *src,
>+ struct vm_area_struct *dest)
>+{
>+ dest->vm_mm = src->vm_mm;
>+ dest->vm_ops = src->vm_ops;
>+ dest->vm_start = src->vm_start;
>+ dest->vm_end = src->vm_end;
>+ dest->anon_vma = src->anon_vma;
>+ dest->vm_pgoff = src->vm_pgoff;
>+ dest->vm_file = src->vm_file;
>+ dest->vm_private_data = src->vm_private_data;
>+ vm_flags_init(dest, src->vm_flags);
>+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
>+ sizeof(dest->vm_page_prot));
>+ /*
>+ * src->shared.rb may be modified concurrently when called from
>+ * dup_mmap(), but the clone will reinitialize it.
>+ */
>+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
>+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
>+ sizeof(dest->vm_userfaultfd_ctx));
>+#ifdef CONFIG_ANON_VMA_NAME
>+ dest->anon_name = src->anon_name;
>+#endif
>+#ifdef CONFIG_SWAP
>+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
>+ sizeof(dest->swap_readahead_info));
>+#endif
>+#ifdef CONFIG_NUMA
>+ dest->vm_policy = src->vm_policy;
>+#endif
>+}
I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
*dest = *src;
And then do any one-off cleanups?
--
Kees Cook
On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
>
>
> On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> >+static void vm_area_init_from(const struct vm_area_struct *src,
> >+ struct vm_area_struct *dest)
> >+{
> >+ dest->vm_mm = src->vm_mm;
> >+ dest->vm_ops = src->vm_ops;
> >+ dest->vm_start = src->vm_start;
> >+ dest->vm_end = src->vm_end;
> >+ dest->anon_vma = src->anon_vma;
> >+ dest->vm_pgoff = src->vm_pgoff;
> >+ dest->vm_file = src->vm_file;
> >+ dest->vm_private_data = src->vm_private_data;
> >+ vm_flags_init(dest, src->vm_flags);
> >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> >+ sizeof(dest->vm_page_prot));
> >+ /*
> >+ * src->shared.rb may be modified concurrently when called from
> >+ * dup_mmap(), but the clone will reinitialize it.
> >+ */
> >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> >+ sizeof(dest->vm_userfaultfd_ctx));
> >+#ifdef CONFIG_ANON_VMA_NAME
> >+ dest->anon_name = src->anon_name;
> >+#endif
> >+#ifdef CONFIG_SWAP
> >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> >+ sizeof(dest->swap_readahead_info));
> >+#endif
> >+#ifdef CONFIG_NUMA
> >+ dest->vm_policy = src->vm_policy;
> >+#endif
> >+}
>
> I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
>
> *dest = *src;
>
> And then do any one-off cleanups?
Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
state for some fields if we miss them here, seems unwise...
Presumably for performance?
This is, as you say, me simply propagating what exists, but I do wonder.
>
>
> --
> Kees Cook
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:40]:
> On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
> >
> >
> > On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > >+ struct vm_area_struct *dest)
> > >+{
> > >+ dest->vm_mm = src->vm_mm;
> > >+ dest->vm_ops = src->vm_ops;
> > >+ dest->vm_start = src->vm_start;
> > >+ dest->vm_end = src->vm_end;
> > >+ dest->anon_vma = src->anon_vma;
> > >+ dest->vm_pgoff = src->vm_pgoff;
> > >+ dest->vm_file = src->vm_file;
> > >+ dest->vm_private_data = src->vm_private_data;
> > >+ vm_flags_init(dest, src->vm_flags);
> > >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > >+ sizeof(dest->vm_page_prot));
> > >+ /*
> > >+ * src->shared.rb may be modified concurrently when called from
> > >+ * dup_mmap(), but the clone will reinitialize it.
> > >+ */
> > >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > >+ sizeof(dest->vm_userfaultfd_ctx));
> > >+#ifdef CONFIG_ANON_VMA_NAME
> > >+ dest->anon_name = src->anon_name;
> > >+#endif
> > >+#ifdef CONFIG_SWAP
> > >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > >+ sizeof(dest->swap_readahead_info));
> > >+#endif
> > >+#ifdef CONFIG_NUMA
> > >+ dest->vm_policy = src->vm_policy;
> > >+#endif
> > >+}
> >
> > I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
> >
> > *dest = *src;
> >
> > And then do any one-off cleanups?
>
> Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
> state for some fields if we miss them here, seems unwise...
>
> Presumably for performance?
>
> This is, as you say, me simply propagating what exists, but I do wonder.
Two things come to mind:
1. How ctors are done. (v3 of Suren's RCU safe patch series, willy made
a comment.. I think)
2. Some race that Vlastimil came up with the copy and the RCU safeness.
IIRC it had to do with the ordering of the setting of things?
Also, looking at it again...
How is it safe to do dest->anon_name = src->anon_name? Isn't that ref
counted?
Pretty sure it's okay, but Suren would know for sure on all of this.
Suren, maybe you could send a patch with comments on this stuff?
Thanks,
Liam
On Fri, Apr 25, 2025 at 6:55 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:40]:
> > On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
> > >
> > >
> > > On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > >+ struct vm_area_struct *dest)
> > > >+{
> > > >+ dest->vm_mm = src->vm_mm;
> > > >+ dest->vm_ops = src->vm_ops;
> > > >+ dest->vm_start = src->vm_start;
> > > >+ dest->vm_end = src->vm_end;
> > > >+ dest->anon_vma = src->anon_vma;
> > > >+ dest->vm_pgoff = src->vm_pgoff;
> > > >+ dest->vm_file = src->vm_file;
> > > >+ dest->vm_private_data = src->vm_private_data;
> > > >+ vm_flags_init(dest, src->vm_flags);
> > > >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > > >+ sizeof(dest->vm_page_prot));
> > > >+ /*
> > > >+ * src->shared.rb may be modified concurrently when called from
> > > >+ * dup_mmap(), but the clone will reinitialize it.
> > > >+ */
> > > >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > > >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > > >+ sizeof(dest->vm_userfaultfd_ctx));
> > > >+#ifdef CONFIG_ANON_VMA_NAME
> > > >+ dest->anon_name = src->anon_name;
> > > >+#endif
> > > >+#ifdef CONFIG_SWAP
> > > >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > > >+ sizeof(dest->swap_readahead_info));
> > > >+#endif
> > > >+#ifdef CONFIG_NUMA
> > > >+ dest->vm_policy = src->vm_policy;
> > > >+#endif
> > > >+}
> > >
> > > I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
> > >
> > > *dest = *src;
> > >
> > > And then do any one-off cleanups?
> >
> > Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
> > state for some fields if we miss them here, seems unwise...
> >
> > Presumably for performance?
> >
> > This is, as you say, me simply propagating what exists, but I do wonder.
>
> Two things come to mind:
>
> 1. How ctors are done. (v3 of Suren's RCU safe patch series, willy made
> a comment.. I think)
>
> 2. Some race that Vlastimil came up with the copy and the RCU safeness.
> IIRC it had to do with the ordering of the setting of things?
>
> Also, looking at it again...
>
> How is it safe to do dest->anon_name = src->anon_name? Isn't that ref
> counted?
dest->anon_name = src->anon_name is fine here because right after
vm_area_init_from() we call dup_anon_vma_name() which will bump up the
refcount. I don't recall why this is done this way but now looking at
it I wonder if I could call dup_anon_vma_name() directly instead of
this assignment. Might be just an overlooked legacy from the time we
memcpy'd the entire structure. I'll need to double-check.
>
> Pretty sure it's okay, but Suren would know for sure on all of this.
>
> Suren, maybe you could send a patch with comments on this stuff?
Yeah, I think I need to add some comments in this code for
clarification. We do not copy the entire vm_area_struct because we
have to preserve vma->vm_refcnt field of the dest vma. Since these
structures are allocated from a cache with SLAB_TYPESAFE_BY_RCU,
another thread might be concurrently checking the state of the dest
object by reading dest->vm_refcnt. Therefore it's important here not
to override the vm_refcnt. Changelog in
https://lore.kernel.org/all/20250213224655.1680278-18-surenb@google.com/
touches on it but a comment in the code would be indeed helpful. Will
add it but will wait for Lorenzo's refactoring to land into linux-mm
first to avoid adding merge conflicts.
>
> Thanks,
> Liam
On Fri, Apr 25, 2025 at 08:32:48AM -0700, Suren Baghdasaryan wrote:
> On Fri, Apr 25, 2025 at 6:55 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:40]:
> > > On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
> > > >
> > > >
> > > > On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > > >+ struct vm_area_struct *dest)
> > > > >+{
> > > > >+ dest->vm_mm = src->vm_mm;
> > > > >+ dest->vm_ops = src->vm_ops;
> > > > >+ dest->vm_start = src->vm_start;
> > > > >+ dest->vm_end = src->vm_end;
> > > > >+ dest->anon_vma = src->anon_vma;
> > > > >+ dest->vm_pgoff = src->vm_pgoff;
> > > > >+ dest->vm_file = src->vm_file;
> > > > >+ dest->vm_private_data = src->vm_private_data;
> > > > >+ vm_flags_init(dest, src->vm_flags);
> > > > >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > > > >+ sizeof(dest->vm_page_prot));
> > > > >+ /*
> > > > >+ * src->shared.rb may be modified concurrently when called from
> > > > >+ * dup_mmap(), but the clone will reinitialize it.
> > > > >+ */
> > > > >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > > > >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > > > >+ sizeof(dest->vm_userfaultfd_ctx));
> > > > >+#ifdef CONFIG_ANON_VMA_NAME
> > > > >+ dest->anon_name = src->anon_name;
> > > > >+#endif
> > > > >+#ifdef CONFIG_SWAP
> > > > >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > > > >+ sizeof(dest->swap_readahead_info));
> > > > >+#endif
> > > > >+#ifdef CONFIG_NUMA
> > > > >+ dest->vm_policy = src->vm_policy;
> > > > >+#endif
> > > > >+}
> > > >
> > > > I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
> > > >
> > > > *dest = *src;
> > > >
> > > > And then do any one-off cleanups?
> > >
> > > Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
> > > state for some fields if we miss them here, seems unwise...
> > >
> > > Presumably for performance?
> > >
> > > This is, as you say, me simply propagating what exists, but I do wonder.
> >
> > Two things come to mind:
> >
> > 1. How ctors are done. (v3 of Suren's RCU safe patch series, willy made
> > a comment.. I think)
> >
> > 2. Some race that Vlastimil came up with the copy and the RCU safeness.
> > IIRC it had to do with the ordering of the setting of things?
> >
> > Also, looking at it again...
> >
> > How is it safe to do dest->anon_name = src->anon_name? Isn't that ref
> > counted?
>
> dest->anon_name = src->anon_name is fine here because right after
> vm_area_init_from() we call dup_anon_vma_name() which will bump up the
> refcount. I don't recall why this is done this way but now looking at
> it I wonder if I could call dup_anon_vma_name() directly instead of
> this assignment. Might be just an overlooked legacy from the time we
> memcpy'd the entire structure. I'll need to double-check.
Oh, is "dest" accessible to other CPU threads? I hadn't looked and was
assuming this was like process creation where everything gets built in
isolation and then attached to the main process tree. I was thinking
this was similar.
--
Kees Cook
On Fri, Apr 25, 2025 at 10:12 AM Kees Cook <kees@kernel.org> wrote:
>
> On Fri, Apr 25, 2025 at 08:32:48AM -0700, Suren Baghdasaryan wrote:
> > On Fri, Apr 25, 2025 at 6:55 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> > >
> > > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:40]:
> > > > On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
> > > > >
> > > > >
> > > > > On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > > > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > > > >+ struct vm_area_struct *dest)
> > > > > >+{
> > > > > >+ dest->vm_mm = src->vm_mm;
> > > > > >+ dest->vm_ops = src->vm_ops;
> > > > > >+ dest->vm_start = src->vm_start;
> > > > > >+ dest->vm_end = src->vm_end;
> > > > > >+ dest->anon_vma = src->anon_vma;
> > > > > >+ dest->vm_pgoff = src->vm_pgoff;
> > > > > >+ dest->vm_file = src->vm_file;
> > > > > >+ dest->vm_private_data = src->vm_private_data;
> > > > > >+ vm_flags_init(dest, src->vm_flags);
> > > > > >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > > > > >+ sizeof(dest->vm_page_prot));
> > > > > >+ /*
> > > > > >+ * src->shared.rb may be modified concurrently when called from
> > > > > >+ * dup_mmap(), but the clone will reinitialize it.
> > > > > >+ */
> > > > > >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > > > > >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > > > > >+ sizeof(dest->vm_userfaultfd_ctx));
> > > > > >+#ifdef CONFIG_ANON_VMA_NAME
> > > > > >+ dest->anon_name = src->anon_name;
> > > > > >+#endif
> > > > > >+#ifdef CONFIG_SWAP
> > > > > >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > > > > >+ sizeof(dest->swap_readahead_info));
> > > > > >+#endif
> > > > > >+#ifdef CONFIG_NUMA
> > > > > >+ dest->vm_policy = src->vm_policy;
> > > > > >+#endif
> > > > > >+}
> > > > >
> > > > > I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
> > > > >
> > > > > *dest = *src;
> > > > >
> > > > > And then do any one-off cleanups?
> > > >
> > > > Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
> > > > state for some fields if we miss them here, seems unwise...
> > > >
> > > > Presumably for performance?
> > > >
> > > > This is, as you say, me simply propagating what exists, but I do wonder.
> > >
> > > Two things come to mind:
> > >
> > > 1. How ctors are done. (v3 of Suren's RCU safe patch series, willy made
> > > a comment.. I think)
> > >
> > > 2. Some race that Vlastimil came up with the copy and the RCU safeness.
> > > IIRC it had to do with the ordering of the setting of things?
> > >
> > > Also, looking at it again...
> > >
> > > How is it safe to do dest->anon_name = src->anon_name? Isn't that ref
> > > counted?
> >
> > dest->anon_name = src->anon_name is fine here because right after
> > vm_area_init_from() we call dup_anon_vma_name() which will bump up the
> > refcount. I don't recall why this is done this way but now looking at
> > it I wonder if I could call dup_anon_vma_name() directly instead of
> > this assignment. Might be just an overlooked legacy from the time we
> > memcpy'd the entire structure. I'll need to double-check.
>
> Oh, is "dest" accessible to other CPU threads? I hadn't looked and was
> assuming this was like process creation where everything gets built in
> isolation and then attached to the main process tree. I was thinking
> this was similar.
Yeah, it's process creation time but this structure is created from a
SLAB_TYPESAFE_BY_RCU cache which adds complexity. A newly allocated
object from this cache might be still accessible from another thread
holding a reference to its earlier incarnation. We need an indication
for that other thread to say "this object has been released, so the
reference you are holding is pointing to a freed or reallocated/wrong
object". vm_refcnt in this case is this indication and we are careful
not to override it even temporarily during object initialization.
Well, in truth we override it later with 0 but for the other thread
that will still mean this object is not what it wants.
I suspect you know this already but just in case
https://elixir.bootlin.com/linux/v6.14.3/source/include/linux/slab.h#L101
has more detailed explanation of SLAB_TYPESAFE_BY_RCU.
>
> --
> Kees Cook
On Fri, Apr 25, 2025 at 8:32 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, Apr 25, 2025 at 6:55 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:40]:
> > > On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
> > > >
> > > >
> > > > On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > > > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > > > >+ struct vm_area_struct *dest)
> > > > >+{
> > > > >+ dest->vm_mm = src->vm_mm;
> > > > >+ dest->vm_ops = src->vm_ops;
> > > > >+ dest->vm_start = src->vm_start;
> > > > >+ dest->vm_end = src->vm_end;
> > > > >+ dest->anon_vma = src->anon_vma;
> > > > >+ dest->vm_pgoff = src->vm_pgoff;
> > > > >+ dest->vm_file = src->vm_file;
> > > > >+ dest->vm_private_data = src->vm_private_data;
> > > > >+ vm_flags_init(dest, src->vm_flags);
> > > > >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > > > >+ sizeof(dest->vm_page_prot));
> > > > >+ /*
> > > > >+ * src->shared.rb may be modified concurrently when called from
> > > > >+ * dup_mmap(), but the clone will reinitialize it.
> > > > >+ */
> > > > >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > > > >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > > > >+ sizeof(dest->vm_userfaultfd_ctx));
> > > > >+#ifdef CONFIG_ANON_VMA_NAME
> > > > >+ dest->anon_name = src->anon_name;
> > > > >+#endif
> > > > >+#ifdef CONFIG_SWAP
> > > > >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > > > >+ sizeof(dest->swap_readahead_info));
> > > > >+#endif
> > > > >+#ifdef CONFIG_NUMA
> > > > >+ dest->vm_policy = src->vm_policy;
> > > > >+#endif
> > > > >+}
> > > >
> > > > I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
> > > >
> > > > *dest = *src;
> > > >
> > > > And then do any one-off cleanups?
> > >
> > > Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
> > > state for some fields if we miss them here, seems unwise...
> > >
> > > Presumably for performance?
> > >
> > > This is, as you say, me simply propagating what exists, but I do wonder.
> >
> > Two things come to mind:
> >
> > 1. How ctors are done. (v3 of Suren's RCU safe patch series, willy made
> > a comment.. I think)
> >
> > 2. Some race that Vlastimil came up with the copy and the RCU safeness.
> > IIRC it had to do with the ordering of the setting of things?
> >
> > Also, looking at it again...
> >
> > How is it safe to do dest->anon_name = src->anon_name? Isn't that ref
> > counted?
>
> dest->anon_name = src->anon_name is fine here because right after
> vm_area_init_from() we call dup_anon_vma_name() which will bump up the
> refcount. I don't recall why this is done this way but now looking at
> it I wonder if I could call dup_anon_vma_name() directly instead of
> this assignment. Might be just an overlooked legacy from the time we
> memcpy'd the entire structure. I'll need to double-check.
>
> >
> > Pretty sure it's okay, but Suren would know for sure on all of this.
> >
> > Suren, maybe you could send a patch with comments on this stuff?
>
> Yeah, I think I need to add some comments in this code for
> clarification. We do not copy the entire vm_area_struct because we
> have to preserve vma->vm_refcnt field of the dest vma. Since these
> structures are allocated from a cache with SLAB_TYPESAFE_BY_RCU,
> another thread might be concurrently checking the state of the dest
> object by reading dest->vm_refcnt. Therefore it's important here not
> to override the vm_refcnt. Changelog in
> https://lore.kernel.org/all/20250213224655.1680278-18-surenb@google.com/
> touches on it but a comment in the code would be indeed helpful. Will
> add it but will wait for Lorenzo's refactoring to land into linux-mm
s/linux-mm/mm-unstable. I need my morning coffee.
> first to avoid adding merge conflicts.
>
> >
> > Thanks,
> > Liam
On Fri, Apr 25, 2025 at 11:40:00AM +0100, Lorenzo Stoakes wrote:
> On Thu, Apr 24, 2025 at 08:15:26PM -0700, Kees Cook wrote:
> >
> >
> > On April 24, 2025 2:15:27 PM PDT, Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> > >+static void vm_area_init_from(const struct vm_area_struct *src,
> > >+ struct vm_area_struct *dest)
> > >+{
> > >+ dest->vm_mm = src->vm_mm;
> > >+ dest->vm_ops = src->vm_ops;
> > >+ dest->vm_start = src->vm_start;
> > >+ dest->vm_end = src->vm_end;
> > >+ dest->anon_vma = src->anon_vma;
> > >+ dest->vm_pgoff = src->vm_pgoff;
> > >+ dest->vm_file = src->vm_file;
> > >+ dest->vm_private_data = src->vm_private_data;
> > >+ vm_flags_init(dest, src->vm_flags);
> > >+ memcpy(&dest->vm_page_prot, &src->vm_page_prot,
> > >+ sizeof(dest->vm_page_prot));
> > >+ /*
> > >+ * src->shared.rb may be modified concurrently when called from
> > >+ * dup_mmap(), but the clone will reinitialize it.
> > >+ */
> > >+ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
> > >+ memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
> > >+ sizeof(dest->vm_userfaultfd_ctx));
> > >+#ifdef CONFIG_ANON_VMA_NAME
> > >+ dest->anon_name = src->anon_name;
> > >+#endif
> > >+#ifdef CONFIG_SWAP
> > >+ memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
> > >+ sizeof(dest->swap_readahead_info));
> > >+#endif
> > >+#ifdef CONFIG_NUMA
> > >+ dest->vm_policy = src->vm_policy;
> > >+#endif
> > >+}
> >
> > I know you're doing a big cut/paste here, but why in the world is this function written this way? Why not just:
> >
> > *dest = *src;
> >
> > And then do any one-off cleanups?
>
> Yup I find it odd, and error prone to be honest. We'll end up with uninitialised
> state for some fields if we miss them here, seems unwise...
>
> Presumably for performance?
>
> This is, as you say, me simply propagating what exists, but I do wonder.
There's a particular advantage here: KMSAN will light up in all sorts of ways
if you forget to copy something explicitly, instead of silently working but also
possibly being silently broken.
Anyway, it came from here: https://lore.kernel.org/all/CAJuCfpFO3Hj+7f10e0Pnvf0U7-dHeYgvjK+4AFD8V=kmG4JA=w@mail.gmail.com/
--
Pedro
On 24.04.25 23:15, Lorenzo Stoakes wrote: > Right now these are performed in kernel/fork.c which is odd and a violation > of separation of concerns, as well as preventing us from integrating this > and related logic into userland VMA testing going forward, and perhaps more > importantly - enabling us to, in a subsequent commit, make VMA > allocation/freeing a purely internal mm operation. > > There is a fly in the ointment - nommu - mmap.c is not compiled if > CONFIG_MMU is not set, and there is no sensible place to put these outside > of that, so we are put in the position of having to duplication some logic > here. > > This isn't ideal, but since nommu is a niche use-case, already duplicates a > great deal of mmu logic by its nature and we can eliminate code that is not > applicable to nommu, it seems a worthwhile trade-off. > > The intent is to move all this logic to vma.c in a subsequent commit, > rendering VMA allocation, freeing and duplication mm-internal-only and > userland testable. I'm pretty sure you tried it, but what's the big blocker to have patch #3 first, so we can avoid the temporary move of the code to mmap.c ? -- Cheers, David / dhildenb
On Thu, Apr 24, 2025 at 11:22:26PM +0200, David Hildenbrand wrote: > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > Right now these are performed in kernel/fork.c which is odd and a violation > > of separation of concerns, as well as preventing us from integrating this > > and related logic into userland VMA testing going forward, and perhaps more > > importantly - enabling us to, in a subsequent commit, make VMA > > allocation/freeing a purely internal mm operation. > > > > There is a fly in the ointment - nommu - mmap.c is not compiled if > > CONFIG_MMU is not set, and there is no sensible place to put these outside > > of that, so we are put in the position of having to duplication some logic > > here. > > > > This isn't ideal, but since nommu is a niche use-case, already duplicates a > > great deal of mmu logic by its nature and we can eliminate code that is not > > applicable to nommu, it seems a worthwhile trade-off. > > > > The intent is to move all this logic to vma.c in a subsequent commit, > > rendering VMA allocation, freeing and duplication mm-internal-only and > > userland testable. > > I'm pretty sure you tried it, but what's the big blocker to have patch #3 > first, so we can avoid the temporary move of the code to mmap.c ? You know, I didn't :P it was late etc. ;) But you're right, will rejig accordingly! > > -- > Cheers, > > David / dhildenb >
On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 24.04.25 23:15, Lorenzo Stoakes wrote:
> > Right now these are performed in kernel/fork.c which is odd and a violation
> > of separation of concerns, as well as preventing us from integrating this
> > and related logic into userland VMA testing going forward, and perhaps more
> > importantly - enabling us to, in a subsequent commit, make VMA
> > allocation/freeing a purely internal mm operation.
> >
> > There is a fly in the ointment - nommu - mmap.c is not compiled if
> > CONFIG_MMU is not set, and there is no sensible place to put these outside
> > of that, so we are put in the position of having to duplication some logic
s/to duplication/to duplicate
> > here.
> >
> > This isn't ideal, but since nommu is a niche use-case, already duplicates a
> > great deal of mmu logic by its nature and we can eliminate code that is not
> > applicable to nommu, it seems a worthwhile trade-off.
> >
> > The intent is to move all this logic to vma.c in a subsequent commit,
> > rendering VMA allocation, freeing and duplication mm-internal-only and
> > userland testable.
>
> I'm pretty sure you tried it, but what's the big blocker to have patch
> #3 first, so we can avoid the temporary move of the code to mmap.c ?
Completely agree with David.
I peeked into 4/4 and it seems you want to keep vma.c completely
CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but
IMHO it would be much cleaner to move these functions into vma.c from
the beginning and have an #ifdef CONFIG_MMU there like this:
mm/vma.c
/* Functions identical for MMU/NOMMU */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) {...}
void __init vma_state_init(void) {...}
#ifdef CONFIG_MMU
static void vm_area_init_from(const struct vm_area_struct *src,
struct vm_area_struct *dest) {...}
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
void vm_area_free(struct vm_area_struct *vma) {...}
#else /* CONFIG_MMU */
static void vm_area_init_from(const struct vm_area_struct *src,
struct vm_area_struct *dest) {...}
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
void vm_area_free(struct vm_area_struct *vma) {...}
#endif /* CONFIG_MMU */
>
> --
> Cheers,
>
> David / dhildenb
>
On Thu, Apr 24, 2025 at 06:22:30PM -0700, Suren Baghdasaryan wrote:
> On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 24.04.25 23:15, Lorenzo Stoakes wrote:
> > > Right now these are performed in kernel/fork.c which is odd and a violation
> > > of separation of concerns, as well as preventing us from integrating this
> > > and related logic into userland VMA testing going forward, and perhaps more
> > > importantly - enabling us to, in a subsequent commit, make VMA
> > > allocation/freeing a purely internal mm operation.
> > >
> > > There is a fly in the ointment - nommu - mmap.c is not compiled if
> > > CONFIG_MMU is not set, and there is no sensible place to put these outside
> > > of that, so we are put in the position of having to duplication some logic
>
> s/to duplication/to duplicate
Ack will fix!
>
> > > here.
> > >
> > > This isn't ideal, but since nommu is a niche use-case, already duplicates a
> > > great deal of mmu logic by its nature and we can eliminate code that is not
> > > applicable to nommu, it seems a worthwhile trade-off.
> > >
> > > The intent is to move all this logic to vma.c in a subsequent commit,
> > > rendering VMA allocation, freeing and duplication mm-internal-only and
> > > userland testable.
> >
> > I'm pretty sure you tried it, but what's the big blocker to have patch
> > #3 first, so we can avoid the temporary move of the code to mmap.c ?
>
> Completely agree with David.
Ack! Yes this was a little bit of a silly one :P
> I peeked into 4/4 and it seems you want to keep vma.c completely
> CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but
> IMHO it would be much cleaner to move these functions into vma.c from
> the beginning and have an #ifdef CONFIG_MMU there like this:
>
> mm/vma.c
This isn't really workable, as the _entire file_ basically contains
CONFIG_MMU-specific stuff. so it'd be one huge #ifdef CONFIG_MMU block with
one small #else block. It'd also be asking for bugs and issues in nommu.
I think doing it this way fits the patterns we have established for
nommu/mmap separation, and I would say nommu is enough of a niche edge case
for us to really not want to have to go to great lengths to find ways of
sharing code.
I am quite concerned about us having to consider it and deal with issues
around it so often, so want to try to avoid that as much as we can,
ideally.
>
> /* Functions identical for MMU/NOMMU */
> struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) {...}
> void __init vma_state_init(void) {...}
>
> #ifdef CONFIG_MMU
> static void vm_area_init_from(const struct vm_area_struct *src,
> struct vm_area_struct *dest) {...}
> struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> void vm_area_free(struct vm_area_struct *vma) {...}
> #else /* CONFIG_MMU */
> static void vm_area_init_from(const struct vm_area_struct *src,
> struct vm_area_struct *dest) {...}
> struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> void vm_area_free(struct vm_area_struct *vma) {...}
> #endif /* CONFIG_MMU */
>
>
>
>
>
> >
> > --
> > Cheers,
> >
> > David / dhildenb
> >
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:09]: > On Thu, Apr 24, 2025 at 06:22:30PM -0700, Suren Baghdasaryan wrote: > > On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote: > > > > > > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > > > Right now these are performed in kernel/fork.c which is odd and a violation > > > > of separation of concerns, as well as preventing us from integrating this > > > > and related logic into userland VMA testing going forward, and perhaps more > > > > importantly - enabling us to, in a subsequent commit, make VMA > > > > allocation/freeing a purely internal mm operation. > > > > > > > > There is a fly in the ointment - nommu - mmap.c is not compiled if > > > > CONFIG_MMU is not set, and there is no sensible place to put these outside > > > > of that, so we are put in the position of having to duplication some logic > > > > s/to duplication/to duplicate > > Ack will fix! > > > > > > > here. > > > > > > > > This isn't ideal, but since nommu is a niche use-case, already duplicates a > > > > great deal of mmu logic by its nature and we can eliminate code that is not > > > > applicable to nommu, it seems a worthwhile trade-off. > > > > > > > > The intent is to move all this logic to vma.c in a subsequent commit, > > > > rendering VMA allocation, freeing and duplication mm-internal-only and > > > > userland testable. > > > > > > I'm pretty sure you tried it, but what's the big blocker to have patch > > > #3 first, so we can avoid the temporary move of the code to mmap.c ? > > > > Completely agree with David. > > Ack! Yes this was a little bit of a silly one :P > > > I peeked into 4/4 and it seems you want to keep vma.c completely > > CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but > > IMHO it would be much cleaner to move these functions into vma.c from > > the beginning and have an #ifdef CONFIG_MMU there like this: > > > > mm/vma.c > > This isn't really workable, as the _entire file_ basically contains > CONFIG_MMU-specific stuff. so it'd be one huge #ifdef CONFIG_MMU block with > one small #else block. It'd also be asking for bugs and issues in nommu. > > I think doing it this way fits the patterns we have established for > nommu/mmap separation, and I would say nommu is enough of a niche edge case > for us to really not want to have to go to great lengths to find ways of > sharing code. > > I am quite concerned about us having to consider it and deal with issues > around it so often, so want to try to avoid that as much as we can, > ideally. I think you're asking for more issues the way you have it now. It could be a very long time until someone sees that nommu isn't working, probably an entire stable kernel cycle. Basically the longest time it can go before being deemed unnecessary to fix. It could also be worse, it could end up like the arch code with bugs over a decade old not being noticed because it was forked off into another file. Could we create another file for the small section of common code and achieve your goals? Thanks, Liam
On Fri, Apr 25, 2025 at 06:26:00AM -0400, Liam R. Howlett wrote: > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:09]: > > On Thu, Apr 24, 2025 at 06:22:30PM -0700, Suren Baghdasaryan wrote: > > > On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote: > > > > > > > > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > > > > Right now these are performed in kernel/fork.c which is odd and a violation > > > > > of separation of concerns, as well as preventing us from integrating this > > > > > and related logic into userland VMA testing going forward, and perhaps more > > > > > importantly - enabling us to, in a subsequent commit, make VMA > > > > > allocation/freeing a purely internal mm operation. > > > > > > > > > > There is a fly in the ointment - nommu - mmap.c is not compiled if > > > > > CONFIG_MMU is not set, and there is no sensible place to put these outside > > > > > of that, so we are put in the position of having to duplication some logic > > > > > > s/to duplication/to duplicate > > > > Ack will fix! > > > > > > > > > > here. > > > > > > > > > > This isn't ideal, but since nommu is a niche use-case, already duplicates a > > > > > great deal of mmu logic by its nature and we can eliminate code that is not > > > > > applicable to nommu, it seems a worthwhile trade-off. > > > > > > > > > > The intent is to move all this logic to vma.c in a subsequent commit, > > > > > rendering VMA allocation, freeing and duplication mm-internal-only and > > > > > userland testable. > > > > > > > > I'm pretty sure you tried it, but what's the big blocker to have patch > > > > #3 first, so we can avoid the temporary move of the code to mmap.c ? > > > > > > Completely agree with David. > > > > Ack! Yes this was a little bit of a silly one :P > > > > > I peeked into 4/4 and it seems you want to keep vma.c completely > > > CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but > > > IMHO it would be much cleaner to move these functions into vma.c from > > > the beginning and have an #ifdef CONFIG_MMU there like this: > > > > > > mm/vma.c > > > > This isn't really workable, as the _entire file_ basically contains > > CONFIG_MMU-specific stuff. so it'd be one huge #ifdef CONFIG_MMU block with > > one small #else block. It'd also be asking for bugs and issues in nommu. > > > > I think doing it this way fits the patterns we have established for > > nommu/mmap separation, and I would say nommu is enough of a niche edge case > > for us to really not want to have to go to great lengths to find ways of > > sharing code. > > > > I am quite concerned about us having to consider it and deal with issues > > around it so often, so want to try to avoid that as much as we can, > > ideally. > > I think you're asking for more issues the way you have it now. It could > be a very long time until someone sees that nommu isn't working, > probably an entire stable kernel cycle. Basically the longest time it > can go before being deemed unnecessary to fix. > > It could also be worse, it could end up like the arch code with bugs > over a decade old not being noticed because it was forked off into > another file. > > Could we create another file for the small section of common code and > achieve your goals? That'd completely defeat the purpose of isolating core functions to vma.c. Again, I don't believe that bending over backwards to support this niche use is appropriate. And if you're making a change to vm_area_alloc(), vm_area_free(), vm_area_init_from(), vm_area_dup() it'd seem like an oversight not to check nommu right? There's already a large amount of duplicated logic there specific to nommu for which precisely the same could be said, including entirely parallel brk(), mmap() implementations. So this isn't a change in how we handle nommu. > > Thanks, > Liam > >
On Fri, Apr 25, 2025 at 11:31:12AM +0100, Lorenzo Stoakes wrote: > On Fri, Apr 25, 2025 at 06:26:00AM -0400, Liam R. Howlett wrote: > > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:09]: > > > On Thu, Apr 24, 2025 at 06:22:30PM -0700, Suren Baghdasaryan wrote: > > > > On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote: > > > > > > > > > > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > > > > > Right now these are performed in kernel/fork.c which is odd and a violation > > > > > > of separation of concerns, as well as preventing us from integrating this > > > > > > and related logic into userland VMA testing going forward, and perhaps more > > > > > > importantly - enabling us to, in a subsequent commit, make VMA > > > > > > allocation/freeing a purely internal mm operation. > > > > > > > > > > > > There is a fly in the ointment - nommu - mmap.c is not compiled if > > > > > > CONFIG_MMU is not set, and there is no sensible place to put these outside > > > > > > of that, so we are put in the position of having to duplication some logic > > > > > > > > s/to duplication/to duplicate > > > > > > Ack will fix! > > > > > > > > > > > > > here. > > > > > > > > > > > > This isn't ideal, but since nommu is a niche use-case, already duplicates a > > > > > > great deal of mmu logic by its nature and we can eliminate code that is not > > > > > > applicable to nommu, it seems a worthwhile trade-off. > > > > > > > > > > > > The intent is to move all this logic to vma.c in a subsequent commit, > > > > > > rendering VMA allocation, freeing and duplication mm-internal-only and > > > > > > userland testable. > > > > > > > > > > I'm pretty sure you tried it, but what's the big blocker to have patch > > > > > #3 first, so we can avoid the temporary move of the code to mmap.c ? > > > > > > > > Completely agree with David. > > > > > > Ack! Yes this was a little bit of a silly one :P > > > > > > > I peeked into 4/4 and it seems you want to keep vma.c completely > > > > CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but > > > > IMHO it would be much cleaner to move these functions into vma.c from > > > > the beginning and have an #ifdef CONFIG_MMU there like this: > > > > > > > > mm/vma.c > > > > > > This isn't really workable, as the _entire file_ basically contains > > > CONFIG_MMU-specific stuff. so it'd be one huge #ifdef CONFIG_MMU block with > > > one small #else block. It'd also be asking for bugs and issues in nommu. > > > > > > I think doing it this way fits the patterns we have established for > > > nommu/mmap separation, and I would say nommu is enough of a niche edge case > > > for us to really not want to have to go to great lengths to find ways of > > > sharing code. > > > > > > I am quite concerned about us having to consider it and deal with issues > > > around it so often, so want to try to avoid that as much as we can, > > > ideally. > > > > I think you're asking for more issues the way you have it now. It could > > be a very long time until someone sees that nommu isn't working, > > probably an entire stable kernel cycle. Basically the longest time it > > can go before being deemed unnecessary to fix. > > > > It could also be worse, it could end up like the arch code with bugs > > over a decade old not being noticed because it was forked off into > > another file. > > > > Could we create another file for the small section of common code and > > achieve your goals? > > That'd completely defeat the purpose of isolating core functions to vma.c. > > Again, I don't believe that bending over backwards to support this niche > use is appropriate. > > And if you're making a change to vm_area_alloc(), vm_area_free(), > vm_area_init_from(), vm_area_dup() it'd seem like an oversight not to check > nommu right? > > There's already a large amount of duplicated logic there specific to nommu > for which precisely the same could be said, including entirely parallel > brk(), mmap() implementations. > > So this isn't a change in how we handle nommu. I guess an alternative is to introduce a new vma_shared.c, vma_shared.h pair of files here, that we try to allow userland isolation for so vma.c can still use for userland testing. This then aligns with your requirement, and keeps it vma-centric like Suren's suggestion. Or perhaps it could even be vma_init.c, vma_init.h? To denote that it references the initialisation and allocation, etc. of VMAs? Anyway we do that, we share it across all, and it solves all problems... gives us the isolation for userland testing and also isolation in mm, while also ensuring no code duplication with nommu. That work? > > > > > Thanks, > > Liam > > > >
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:45]: ... > > > > > > > > I think doing it this way fits the patterns we have established for > > > > nommu/mmap separation, and I would say nommu is enough of a niche edge case > > > > for us to really not want to have to go to great lengths to find ways of > > > > sharing code. > > > > > > > > I am quite concerned about us having to consider it and deal with issues > > > > around it so often, so want to try to avoid that as much as we can, > > > > ideally. > > > > > > I think you're asking for more issues the way you have it now. It could > > > be a very long time until someone sees that nommu isn't working, > > > probably an entire stable kernel cycle. Basically the longest time it > > > can go before being deemed unnecessary to fix. > > > > > > It could also be worse, it could end up like the arch code with bugs > > > over a decade old not being noticed because it was forked off into > > > another file. > > > > > > Could we create another file for the small section of common code and > > > achieve your goals? > > > > That'd completely defeat the purpose of isolating core functions to vma.c. > > > > Again, I don't believe that bending over backwards to support this niche > > use is appropriate. > > > > And if you're making a change to vm_area_alloc(), vm_area_free(), > > vm_area_init_from(), vm_area_dup() it'd seem like an oversight not to check > > nommu right? > > > > There's already a large amount of duplicated logic there specific to nommu > > for which precisely the same could be said, including entirely parallel > > brk(), mmap() implementations. > > > > So this isn't a change in how we handle nommu. > > I guess an alternative is to introduce a new vma_shared.c, vma_shared.h > pair of files here, that we try to allow userland isolation for so vma.c > can still use for userland testing. > > This then aligns with your requirement, and keeps it vma-centric like > Suren's suggestion. > > Or perhaps it could even be vma_init.c, vma_init.h? To denote that it > references the initialisation and allocation, etc. of VMAs? Sure, the name isn't as important as the concept, but I like vma_init better than vma_shared. > > Anyway we do that, we share it across all, and it solves all > problems... gives us the isolation for userland testing and also isolation > in mm, while also ensuring no code duplication with nommu. > > That work? Yes, this is what I was suggesting. I really think this is the least painful way to deal with nommu. Otherwise we will waste more time later trying to fix what was overlooked. Thanks, Liam
On Fri, Apr 25, 2025 at 07:00:27AM -0400, Liam R. Howlett wrote: > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250425 06:45]: > ... > > > > > > > > > > > I think doing it this way fits the patterns we have established for > > > > > nommu/mmap separation, and I would say nommu is enough of a niche edge case > > > > > for us to really not want to have to go to great lengths to find ways of > > > > > sharing code. > > > > > > > > > > I am quite concerned about us having to consider it and deal with issues > > > > > around it so often, so want to try to avoid that as much as we can, > > > > > ideally. > > > > > > > > I think you're asking for more issues the way you have it now. It could > > > > be a very long time until someone sees that nommu isn't working, > > > > probably an entire stable kernel cycle. Basically the longest time it > > > > can go before being deemed unnecessary to fix. > > > > > > > > It could also be worse, it could end up like the arch code with bugs > > > > over a decade old not being noticed because it was forked off into > > > > another file. > > > > > > > > Could we create another file for the small section of common code and > > > > achieve your goals? > > > > > > That'd completely defeat the purpose of isolating core functions to vma.c. > > > > > > Again, I don't believe that bending over backwards to support this niche > > > use is appropriate. > > > > > > And if you're making a change to vm_area_alloc(), vm_area_free(), > > > vm_area_init_from(), vm_area_dup() it'd seem like an oversight not to check > > > nommu right? > > > > > > There's already a large amount of duplicated logic there specific to nommu > > > for which precisely the same could be said, including entirely parallel > > > brk(), mmap() implementations. > > > > > > So this isn't a change in how we handle nommu. > > > > I guess an alternative is to introduce a new vma_shared.c, vma_shared.h > > pair of files here, that we try to allow userland isolation for so vma.c > > can still use for userland testing. > > > > This then aligns with your requirement, and keeps it vma-centric like > > Suren's suggestion. > > > > Or perhaps it could even be vma_init.c, vma_init.h? To denote that it > > references the initialisation and allocation, etc. of VMAs? > > Sure, the name isn't as important as the concept, but I like vma_init > better than vma_shared. > > > > > Anyway we do that, we share it across all, and it solves all > > problems... gives us the isolation for userland testing and also isolation > > in mm, while also ensuring no code duplication with nommu. > > > > That work? > > Yes, this is what I was suggesting. Ack, I wondered if you meant placing it in some existing shared file such as mm/util.c hence objecting, but this works better :) > > I really think this is the least painful way to deal with nommu. > Otherwise we will waste more time later trying to fix what was > overlooked. Sure, I definitely understand the motivation. I resent that we have to think about this, but we do have to live with it, so agreed this is the best approach. > > Thanks, > Liam > Will rework series to do this and send a v2. Cheers, Lorenzo
On Thu, Apr 24, 2025 at 6:22 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote:
> >
> > On 24.04.25 23:15, Lorenzo Stoakes wrote:
> > > Right now these are performed in kernel/fork.c which is odd and a violation
> > > of separation of concerns, as well as preventing us from integrating this
> > > and related logic into userland VMA testing going forward, and perhaps more
> > > importantly - enabling us to, in a subsequent commit, make VMA
> > > allocation/freeing a purely internal mm operation.
> > >
> > > There is a fly in the ointment - nommu - mmap.c is not compiled if
> > > CONFIG_MMU is not set, and there is no sensible place to put these outside
> > > of that, so we are put in the position of having to duplication some logic
>
> s/to duplication/to duplicate
>
> > > here.
> > >
> > > This isn't ideal, but since nommu is a niche use-case, already duplicates a
> > > great deal of mmu logic by its nature and we can eliminate code that is not
> > > applicable to nommu, it seems a worthwhile trade-off.
> > >
> > > The intent is to move all this logic to vma.c in a subsequent commit,
> > > rendering VMA allocation, freeing and duplication mm-internal-only and
> > > userland testable.
> >
> > I'm pretty sure you tried it, but what's the big blocker to have patch
> > #3 first, so we can avoid the temporary move of the code to mmap.c ?
>
> Completely agree with David.
> I peeked into 4/4 and it seems you want to keep vma.c completely
> CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but
> IMHO it would be much cleaner to move these functions into vma.c from
> the beginning and have an #ifdef CONFIG_MMU there like this:
>
> mm/vma.c
>
> /* Functions identical for MMU/NOMMU */
> struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) {...}
> void __init vma_state_init(void) {...}
>
> #ifdef CONFIG_MMU
> static void vm_area_init_from(const struct vm_area_struct *src,
> struct vm_area_struct *dest) {...}
> struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> void vm_area_free(struct vm_area_struct *vma) {...}
> #else /* CONFIG_MMU */
> static void vm_area_init_from(const struct vm_area_struct *src,
> struct vm_area_struct *dest) {...}
> struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> void vm_area_free(struct vm_area_struct *vma) {...}
> #endif /* CONFIG_MMU */
3/4 and 4/4 look reasonable but they can change substantially
depending on your answer to my suggestion above, so I'll wait for your
answer before moving forward.
Thanks for doing this!
Suren.
>
>
>
>
>
> >
> > --
> > Cheers,
> >
> > David / dhildenb
> >
On Thu, Apr 24, 2025 at 06:37:39PM -0700, Suren Baghdasaryan wrote:
> On Thu, Apr 24, 2025 at 6:22 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote:
> > >
> > > On 24.04.25 23:15, Lorenzo Stoakes wrote:
> > > > Right now these are performed in kernel/fork.c which is odd and a violation
> > > > of separation of concerns, as well as preventing us from integrating this
> > > > and related logic into userland VMA testing going forward, and perhaps more
> > > > importantly - enabling us to, in a subsequent commit, make VMA
> > > > allocation/freeing a purely internal mm operation.
> > > >
> > > > There is a fly in the ointment - nommu - mmap.c is not compiled if
> > > > CONFIG_MMU is not set, and there is no sensible place to put these outside
> > > > of that, so we are put in the position of having to duplication some logic
> >
> > s/to duplication/to duplicate
> >
> > > > here.
> > > >
> > > > This isn't ideal, but since nommu is a niche use-case, already duplicates a
> > > > great deal of mmu logic by its nature and we can eliminate code that is not
> > > > applicable to nommu, it seems a worthwhile trade-off.
> > > >
> > > > The intent is to move all this logic to vma.c in a subsequent commit,
> > > > rendering VMA allocation, freeing and duplication mm-internal-only and
> > > > userland testable.
> > >
> > > I'm pretty sure you tried it, but what's the big blocker to have patch
> > > #3 first, so we can avoid the temporary move of the code to mmap.c ?
> >
> > Completely agree with David.
> > I peeked into 4/4 and it seems you want to keep vma.c completely
> > CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but
> > IMHO it would be much cleaner to move these functions into vma.c from
> > the beginning and have an #ifdef CONFIG_MMU there like this:
> >
> > mm/vma.c
> >
> > /* Functions identical for MMU/NOMMU */
> > struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) {...}
> > void __init vma_state_init(void) {...}
> >
> > #ifdef CONFIG_MMU
> > static void vm_area_init_from(const struct vm_area_struct *src,
> > struct vm_area_struct *dest) {...}
> > struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> > void vm_area_free(struct vm_area_struct *vma) {...}
> > #else /* CONFIG_MMU */
> > static void vm_area_init_from(const struct vm_area_struct *src,
> > struct vm_area_struct *dest) {...}
> > struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> > void vm_area_free(struct vm_area_struct *vma) {...}
> > #endif /* CONFIG_MMU */
>
> 3/4 and 4/4 look reasonable but they can change substantially
> depending on your answer to my suggestion above, so I'll wait for your
> answer before moving forward.
> Thanks for doing this!
> Suren.
You're welcome :)
Well I will be fixing the issue David raised of course :) but as stated in
previous email, I don't feel it makes sense to put nommu stuff in vma.c really.
>
> >
> >
> >
> >
> >
> > >
> > > --
> > > Cheers,
> > >
> > > David / dhildenb
> > >
On Fri, Apr 25, 2025 at 11:10:00AM +0100, Lorenzo Stoakes wrote:
> On Thu, Apr 24, 2025 at 06:37:39PM -0700, Suren Baghdasaryan wrote:
> > On Thu, Apr 24, 2025 at 6:22 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Thu, Apr 24, 2025 at 2:22 PM David Hildenbrand <david@redhat.com> wrote:
> > > >
> > > > On 24.04.25 23:15, Lorenzo Stoakes wrote:
> > > > > Right now these are performed in kernel/fork.c which is odd and a violation
> > > > > of separation of concerns, as well as preventing us from integrating this
> > > > > and related logic into userland VMA testing going forward, and perhaps more
> > > > > importantly - enabling us to, in a subsequent commit, make VMA
> > > > > allocation/freeing a purely internal mm operation.
> > > > >
> > > > > There is a fly in the ointment - nommu - mmap.c is not compiled if
> > > > > CONFIG_MMU is not set, and there is no sensible place to put these outside
> > > > > of that, so we are put in the position of having to duplication some logic
> > >
> > > s/to duplication/to duplicate
> > >
> > > > > here.
> > > > >
> > > > > This isn't ideal, but since nommu is a niche use-case, already duplicates a
> > > > > great deal of mmu logic by its nature and we can eliminate code that is not
> > > > > applicable to nommu, it seems a worthwhile trade-off.
> > > > >
> > > > > The intent is to move all this logic to vma.c in a subsequent commit,
> > > > > rendering VMA allocation, freeing and duplication mm-internal-only and
> > > > > userland testable.
> > > >
> > > > I'm pretty sure you tried it, but what's the big blocker to have patch
> > > > #3 first, so we can avoid the temporary move of the code to mmap.c ?
> > >
> > > Completely agree with David.
> > > I peeked into 4/4 and it seems you want to keep vma.c completely
> > > CONFIG_MMU-centric. I know we treat NOMMU as an unwanted child but
> > > IMHO it would be much cleaner to move these functions into vma.c from
> > > the beginning and have an #ifdef CONFIG_MMU there like this:
> > >
> > > mm/vma.c
> > >
> > > /* Functions identical for MMU/NOMMU */
> > > struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) {...}
> > > void __init vma_state_init(void) {...}
> > >
> > > #ifdef CONFIG_MMU
> > > static void vm_area_init_from(const struct vm_area_struct *src,
> > > struct vm_area_struct *dest) {...}
> > > struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> > > void vm_area_free(struct vm_area_struct *vma) {...}
> > > #else /* CONFIG_MMU */
> > > static void vm_area_init_from(const struct vm_area_struct *src,
> > > struct vm_area_struct *dest) {...}
> > > struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) {...}
> > > void vm_area_free(struct vm_area_struct *vma) {...}
> > > #endif /* CONFIG_MMU */
> >
> > 3/4 and 4/4 look reasonable but they can change substantially
> > depending on your answer to my suggestion above, so I'll wait for your
> > answer before moving forward.
> > Thanks for doing this!
> > Suren.
>
> You're welcome :)
>
> Well I will be fixing the issue David raised of course :) but as stated in
> previous email, I don't feel it makes sense to put nommu stuff in vma.c really.
UPDATE: As per discussions with Liam, will be going ahead with an alternative
but equivalent approach.
Thanks to both of you for your suggestions on this!
Cheers, Lorenzo
>
> >
> > >
> > >
> > >
> > >
> > >
> > > >
> > > > --
> > > > Cheers,
> > > >
> > > > David / dhildenb
> > > >
© 2016 - 2026 Red Hat, Inc.