There are peculiarities within the kernel where what is very clearly mm
code is implemented elsewhere arbitrarily.
This violates separation of concerns and makes it harder to refactor code
to make changes to how fundamental initialisation and operation of mm logic
is performed.
One such case is the creation of the VMA containing the initial stack upon
execve()'ing a new process. This is currently performed in __bprm_mm_init()
in fs/exec.c.
Abstract this operation to create_init_stack_vma(). This allows us to limit
use of VMA allocation and free code to fork and mm only.
We previously did the same for the step at which we relocate the initial
stack VMA downwards via relocate_vma_down(); now we move the initial VMA
establishment too.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
fs/exec.c | 51 +--------------------------------
include/linux/mm.h | 2 ++
mm/mmap.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 74 insertions(+), 50 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 8e4ea5f1e64c..ef34a68ef825 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -244,56 +244,7 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
static int __bprm_mm_init(struct linux_binprm *bprm)
{
- int err;
- struct vm_area_struct *vma = NULL;
- struct mm_struct *mm = bprm->mm;
-
- bprm->vma = vma = vm_area_alloc(mm);
- if (!vma)
- return -ENOMEM;
- vma_set_anonymous(vma);
-
- if (mmap_write_lock_killable(mm)) {
- err = -EINTR;
- goto err_free;
- }
-
- /*
- * Need to be called with mmap write lock
- * held, to avoid race with ksmd.
- */
- err = ksm_execve(mm);
- if (err)
- goto err_ksm;
-
- /*
- * Place the stack at the largest stack address the architecture
- * supports. Later, we'll move this to an appropriate place. We don't
- * use STACK_TOP because that can depend on attributes which aren't
- * configured yet.
- */
- BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
- vma->vm_end = STACK_TOP_MAX;
- vma->vm_start = vma->vm_end - PAGE_SIZE;
- vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
- vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-
- err = insert_vm_struct(mm, vma);
- if (err)
- goto err;
-
- mm->stack_vm = mm->total_vm = 1;
- mmap_write_unlock(mm);
- bprm->p = vma->vm_end - sizeof(void *);
- return 0;
-err:
- ksm_exit(mm);
-err_ksm:
- mmap_write_unlock(mm);
-err_free:
- bprm->vma = NULL;
- vm_area_free(vma);
- return err;
+ return create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
}
static bool valid_arg_len(struct linux_binprm *bprm, long len)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b701cfbef22..fa84e59a99bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3223,6 +3223,8 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p);
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, bool write);
diff --git a/mm/mmap.c b/mm/mmap.c
index bd210aaf7ebd..1289c6381419 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1717,6 +1717,77 @@ static int __meminit init_reserve_notifier(void)
}
subsys_initcall(init_reserve_notifier);
+/*
+ * Establish the stack VMA in an execve'd process, located temporarily at the
+ * maximum stack address provided by the architecture.
+ *
+ * We later relocate this downwards in relocate_vma_down().
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable initialisation.
+ *
+ * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the
+ * maximum addressable location in the stack (that is capable of storing a
+ * system word of data).
+ *
+ * On failure, returns an error code.
+ */
+int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,
+ unsigned long *top_mem_p)
+{
+ int err;
+ struct vm_area_struct *vma = vm_area_alloc(mm);
+
+ if (!vma)
+ return -ENOMEM;
+
+ vma_set_anonymous(vma);
+
+ if (mmap_write_lock_killable(mm)) {
+ err = -EINTR;
+ goto err_free;
+ }
+
+ /*
+ * Need to be called with mmap write lock
+ * held, to avoid race with ksmd.
+ */
+ err = ksm_execve(mm);
+ if (err)
+ goto err_ksm;
+
+ /*
+ * Place the stack at the largest stack address the architecture
+ * supports. Later, we'll move this to an appropriate place. We don't
+ * use STACK_TOP because that can depend on attributes which aren't
+ * configured yet.
+ */
+ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_end = STACK_TOP_MAX;
+ vma->vm_start = vma->vm_end - PAGE_SIZE;
+ vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
+ vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
+
+ err = insert_vm_struct(mm, vma);
+ if (err)
+ goto err;
+
+ mm->stack_vm = mm->total_vm = 1;
+ mmap_write_unlock(mm);
+ *vmap = vma;
+ *top_mem_p = vma->vm_end - sizeof(void *);
+ return 0;
+
+err:
+ ksm_exit(mm);
+err_ksm:
+ mmap_write_unlock(mm);
+err_free:
+ *vmap = NULL;
+ vm_area_free(vma);
+ return err;
+}
+
/*
* Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
* this VMA and its relocated range, which will now reside at [vma->vm_start -
--
2.49.0
On 24.04.25 23:15, Lorenzo Stoakes wrote: > There are peculiarities within the kernel where what is very clearly mm > code is performed elsewhere arbitrarily. > > This violates separation of concerns and makes it harder to refactor code > to make changes to how fundamental initialisation and operation of mm logic > is performed. > > One such case is the creation of the VMA containing the initial stack upon > execve()'ing a new process. This is currently performed in __bprm_mm_init() > in fs/exec.c. > > Abstract this operation to create_init_stack_vma(). This allows us to limit > use of vma allocation and free code to fork and mm only. > > We previously did the same for the step at which we relocate the initial > stack VMA downwards via relocate_vma_down(), now we move the initial VMA > establishment too. > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- ... > +/* > + * Establish the stack VMA in an execve'd process, located temporarily at the > + * maximum stack address provided by the architecture. > + * > + * We later relocate this downwards in relocate_vma_down(). > + * > + * This function is almost certainly NOT what you want for anything other than > + * early executable initialisation. > + * > + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the > + * maximum addressable location in the stack (that is capable of storing a > + * system word of data). > + * > + * on failure, returns an error code. > + */ I was about to say, if you already write that much documentation, why not turn it into kerneldoc? :) But this function is clearly not intended to have more than one caller, so ... :) Acked-by: David Hildenbrand <david@redhat.com> -- Cheers, David / dhildenb
On Thu, Apr 24, 2025 at 11:30:35PM +0200, David Hildenbrand wrote: > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > There are peculiarities within the kernel where what is very clearly mm > > code is performed elsewhere arbitrarily. > > > > This violates separation of concerns and makes it harder to refactor code > > to make changes to how fundamental initialisation and operation of mm logic > > is performed. > > > > One such case is the creation of the VMA containing the initial stack upon > > execve()'ing a new process. This is currently performed in __bprm_mm_init() > > in fs/exec.c. > > > > Abstract this operation to create_init_stack_vma(). This allows us to limit > > use of vma allocation and free code to fork and mm only. > > > > We previously did the same for the step at which we relocate the initial > > stack VMA downwards via relocate_vma_down(), now we move the initial VMA > > establishment too. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > --- > ... > > > +/* > > + * Establish the stack VMA in an execve'd process, located temporarily at the > > + * maximum stack address provided by the architecture. > > + * > > + * We later relocate this downwards in relocate_vma_down(). > > + * > > + * This function is almost certainly NOT what you want for anything other than > > + * early executable initialisation. > > + * > > + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the > > + * maximum addressable location in the stack (that is capable of storing a > > + * system word of data). > > + * > > + * on failure, returns an error code. > > + */ > > I was about to say, if you already write that much documentation, why not > turn it into kerneldoc? :) But this function is clearly not intended to have > more than one caller, so ... :) Haha yeah, I felt for this case it's probably not necessary, bit of a blurry line on this but as a one-off thing probably ok :P > > Acked-by: David Hildenbrand <david@redhat.com> Thanks! 
Sorry I forgot to say thanks also to Suren for his tag in other email, so will say here - also thanks Suren :) > > -- > Cheers, > > David / dhildenb >
On Thu, Apr 24, 2025 at 2:30 PM David Hildenbrand <david@redhat.com> wrote: > > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > There are peculiarities within the kernel where what is very clearly mm > > code is performed elsewhere arbitrarily. > > > > This violates separation of concerns and makes it harder to refactor code > > to make changes to how fundamental initialisation and operation of mm logic > > is performed. > > > > One such case is the creation of the VMA containing the initial stack upon > > execve()'ing a new process. This is currently performed in __bprm_mm_init() > > in fs/exec.c. > > > > Abstract this operation to create_init_stack_vma(). This allows us to limit > > use of vma allocation and free code to fork and mm only. > > > > We previously did the same for the step at which we relocate the initial > > stack VMA downwards via relocate_vma_down(), now we move the initial VMA > > establishment too. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > --- > ... > > > +/* > > + * Establish the stack VMA in an execve'd process, located temporarily at the > > + * maximum stack address provided by the architecture. > > + * > > + * We later relocate this downwards in relocate_vma_down(). > > + * > > + * This function is almost certainly NOT what you want for anything other than > > + * early executable initialisation. > > + * > > + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the > > + * maximum addressable location in the stack (that is capable of storing a > > + * system word of data). > > + * > > + * on failure, returns an error code. nit: s/on/On You could also skip this sentence altogether since it's kinda obvious but up to you. > > + */ > > I was about to say, if you already write that much documentation, why > not turn it into kerneldoc? :) But this function is clearly not intended > to have more than one caller, so ... 
:) > > Acked-by: David Hildenbrand <david@redhat.com> Reviewed-by: Suren Baghdasaryan <surenb@google.com> > > -- > Cheers, > > David / dhildenb >
On Thu, Apr 24, 2025 at 05:55:20PM -0700, Suren Baghdasaryan wrote: > On Thu, Apr 24, 2025 at 2:30 PM David Hildenbrand <david@redhat.com> wrote: > > > > On 24.04.25 23:15, Lorenzo Stoakes wrote: > > > There are peculiarities within the kernel where what is very clearly mm > > > code is performed elsewhere arbitrarily. > > > > > > This violates separation of concerns and makes it harder to refactor code > > > to make changes to how fundamental initialisation and operation of mm logic > > > is performed. > > > > > > One such case is the creation of the VMA containing the initial stack upon > > > execve()'ing a new process. This is currently performed in __bprm_mm_init() > > > in fs/exec.c. > > > > > > Abstract this operation to create_init_stack_vma(). This allows us to limit > > > use of vma allocation and free code to fork and mm only. > > > > > > We previously did the same for the step at which we relocate the initial > > > stack VMA downwards via relocate_vma_down(), now we move the initial VMA > > > establishment too. > > > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > --- > > ... > > > > > +/* > > > + * Establish the stack VMA in an execve'd process, located temporarily at the > > > + * maximum stack address provided by the architecture. > > > + * > > > + * We later relocate this downwards in relocate_vma_down(). > > > + * > > > + * This function is almost certainly NOT what you want for anything other than > > > + * early executable initialisation. > > > + * > > > + * On success, returns 0 and sets *vmap to the stack VMA and *top_mem_p to the > > > + * maximum addressable location in the stack (that is capable of storing a > > > + * system word of data). > > > + * > > > + * on failure, returns an error code. > > nit: s/on/On > You could also skip this sentence altogether since it's kinda obvious > but up to you. 
Ack, and yeah probably best to just drop tbh :) > > > > + */ > > > > I was about to say, if you already write that much documentation, why > > not turn it into kerneldoc? :) But this function is clearly not intended > > to have more than one caller, so ... :) > > > > Acked-by: David Hildenbrand <david@redhat.com> > > Reviewed-by: Suren Baghdasaryan <surenb@google.com> > > > > > -- > > Cheers, > > > > David / dhildenb > >
© 2016 - 2026 Red Hat, Inc.