There is functionality that overlaps the exec and memory mapping
subsystems. While it properly belongs in mm, it is important that exec
maintainers maintain oversight of this functionality correctly.
We can establish both goals by adding a new mm/vma_exec.c file which
contains these 'glue' functions, and have fs/exec.c import them.
As a part of this change, to ensure that proper oversight is achieved, add
the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
scripts/get_maintainer.pl can correctly handle files in multiple entries
and this neatly handles the cross-over.
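
For instance, one can sanity-check that both entries pick the new file up
(illustrative only; the exact output depends on the current MAINTAINERS
contents):

  $ ./scripts/get_maintainer.pl -f mm/vma_exec.c

which should report people from both the MEMORY MAPPING and the
EXEC & BINFMT API, ELF entries.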
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
MAINTAINERS | 2 +
fs/exec.c | 3 ++
include/linux/mm.h | 1 -
mm/Makefile | 2 +-
mm/mmap.c | 83 ----------------------------
mm/vma.h | 5 ++
mm/vma_exec.c | 92 ++++++++++++++++++++++++++++++++
tools/testing/vma/Makefile | 2 +-
tools/testing/vma/vma.c | 1 +
tools/testing/vma/vma_internal.h | 40 ++++++++++++++
10 files changed, 145 insertions(+), 86 deletions(-)
create mode 100644 mm/vma_exec.c
diff --git a/MAINTAINERS b/MAINTAINERS
index f5ee0390cdee..1ee1c22e6e36 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8830,6 +8830,7 @@ F: include/linux/elf.h
F: include/uapi/linux/auxvec.h
F: include/uapi/linux/binfmts.h
F: include/uapi/linux/elf.h
+F: mm/vma_exec.c
F: tools/testing/selftests/exec/
N: asm/elf.h
N: binfmt
@@ -15654,6 +15655,7 @@ F: mm/mremap.c
F: mm/mseal.c
F: mm/vma.c
F: mm/vma.h
+F: mm/vma_exec.c
F: mm/vma_internal.h
F: tools/testing/selftests/mm/merge.c
F: tools/testing/vma/
diff --git a/fs/exec.c b/fs/exec.c
index 8e4ea5f1e64c..477bc3f2e966 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -78,6 +78,9 @@
#include <trace/events/sched.h>
+/* For vma exec functions. */
+#include "../mm/internal.h"
+
static int bprm_creds_from_file(struct linux_binprm *bprm);
int suid_dumpable = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 21dd110b6655..4fc361df9ad7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3223,7 +3223,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, bool write);
diff --git a/mm/Makefile b/mm/Makefile
index 9d7e5b5bb694..15a901bb431a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o vma.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
diff --git a/mm/mmap.c b/mm/mmap.c
index bd210aaf7ebd..1794bf6f4dc0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1717,89 +1717,6 @@ static int __meminit init_reserve_notifier(void)
}
subsys_initcall(init_reserve_notifier);
-/*
- * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
- * this VMA and its relocated range, which will now reside at [vma->vm_start -
- * shift, vma->vm_end - shift).
- *
- * This function is almost certainly NOT what you want for anything other than
- * early executable temporary stack relocation.
- */
-int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
-{
- /*
- * The process proceeds as follows:
- *
- * 1) Use shift to calculate the new vma endpoints.
- * 2) Extend vma to cover both the old and new ranges. This ensures the
- * arguments passed to subsequent functions are consistent.
- * 3) Move vma's page tables to the new range.
- * 4) Free up any cleared pgd range.
- * 5) Shrink the vma to cover only the new range.
- */
-
- struct mm_struct *mm = vma->vm_mm;
- unsigned long old_start = vma->vm_start;
- unsigned long old_end = vma->vm_end;
- unsigned long length = old_end - old_start;
- unsigned long new_start = old_start - shift;
- unsigned long new_end = old_end - shift;
- VMA_ITERATOR(vmi, mm, new_start);
- VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
- struct vm_area_struct *next;
- struct mmu_gather tlb;
- PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
-
- BUG_ON(new_start > new_end);
-
- /*
- * ensure there are no vmas between where we want to go
- * and where we are
- */
- if (vma != vma_next(&vmi))
- return -EFAULT;
-
- vma_iter_prev_range(&vmi);
- /*
- * cover the whole range: [new_start, old_end)
- */
- vmg.middle = vma;
- if (vma_expand(&vmg))
- return -ENOMEM;
-
- /*
- * move the page tables downwards, on failure we rely on
- * process cleanup to remove whatever mess we made.
- */
- pmc.for_stack = true;
- if (length != move_page_tables(&pmc))
- return -ENOMEM;
-
- tlb_gather_mmu(&tlb, mm);
- next = vma_next(&vmi);
- if (new_end > old_start) {
- /*
- * when the old and new regions overlap clear from new_end.
- */
- free_pgd_range(&tlb, new_end, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- } else {
- /*
- * otherwise, clean from old_start; this is done to not touch
- * the address space in [new_end, old_start) some architectures
- * have constraints on va-space that make this illegal (IA64) -
- * for the others its just a little faster.
- */
- free_pgd_range(&tlb, old_start, old_end, new_end,
- next ? next->vm_start : USER_PGTABLES_CEILING);
- }
- tlb_finish_mmu(&tlb);
-
- vma_prev(&vmi);
- /* Shrink the vma to just the new range */
- return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
-}
-
#ifdef CONFIG_MMU
/*
* Obtain a read lock on mm->mmap_lock, if the specified address is below the
diff --git a/mm/vma.h b/mm/vma.h
index 149926e8a6d1..1ce3e18f01b7 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -548,4 +548,9 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
int __vm_munmap(unsigned long start, size_t len, bool unlock);
+/* vma_exec.h */
+#ifdef CONFIG_MMU
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+#endif
+
#endif /* __MM_VMA_H */
diff --git a/mm/vma_exec.c b/mm/vma_exec.c
new file mode 100644
index 000000000000..6736ae37f748
--- /dev/null
+++ b/mm/vma_exec.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Functions explicitly implemented for exec functionality which however are
+ * explicitly VMA-only logic.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+ PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.middle = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+ * move the page tables downwards, on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ pmc.for_stack = true;
+ if (length != move_page_tables(&pmc))
+ return -ENOMEM;
+
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+ * when the old and new regions overlap clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+ * otherwise, clean from old_start; this is done to not touch
+ * the address space in [new_end, old_start) some architectures
+ * have constraints on va-space that make this illegal (IA64) -
+ * for the others its just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
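
[For reference, not part of the diff: the exec-side user of this helper is
setup_arg_pages() in fs/exec.c, which relocates the temporary argument stack
once its final position is known. A rough sketch of that call site follows;
the exact surrounding code and variable names in fs/exec.c may differ:

	/* In fs/exec.c:setup_arg_pages(), once stack_shift has been computed. */
	if (stack_shift) {
		/* Move the temporary stack VMA and its page tables down. */
		ret = relocate_vma_down(vma, stack_shift);
		if (ret)
			goto out_unlock;
	}
]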
diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
index 860fd2311dcc..624040fcf193 100644
--- a/tools/testing/vma/Makefile
+++ b/tools/testing/vma/Makefile
@@ -9,7 +9,7 @@ include ../shared/shared.mk
OFILES = $(SHARED_OFILES) vma.o maple-shim.o
TARGETS = vma
-vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h
+vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
vma: $(OFILES)
$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
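
[A quick sanity check that the userland VMA tests still build and run with the
new dependency in place, assuming the usual flow for this harness:

	$ make -C tools/testing/vma
	$ ./tools/testing/vma/vma
]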
diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
index 7cfd6e31db10..5832ae5d797d 100644
--- a/tools/testing/vma/vma.c
+++ b/tools/testing/vma/vma.c
@@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
* Directly import the VMA implementation here. Our vma_internal.h wrapper
* provides userland-equivalent functionality for everything vma.c uses.
*/
+#include "../../../mm/vma_exec.c"
#include "../../../mm/vma.c"
const struct vm_operations_struct vma_dummy_vm_ops;
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 572ab2cea763..0df19ca0000a 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -421,6 +421,28 @@ struct vm_unmapped_area_info {
unsigned long start_gap;
};
+struct pagetable_move_control {
+ struct vm_area_struct *old; /* Source VMA. */
+ struct vm_area_struct *new; /* Destination VMA. */
+ unsigned long old_addr; /* Address from which the move begins. */
+ unsigned long old_end; /* Exclusive address at which old range ends. */
+ unsigned long new_addr; /* Address to move page tables to. */
+ unsigned long len_in; /* Bytes to remap specified by user. */
+
+ bool need_rmap_locks; /* Do rmap locks need to be taken? */
+ bool for_stack; /* Is this an early temp stack being moved? */
+};
+
+#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
+ struct pagetable_move_control name = { \
+ .old = old_, \
+ .new = new_, \
+ .old_addr = old_addr_, \
+ .old_end = (old_addr_) + (len_), \
+ .new_addr = new_addr_, \
+ .len_in = len_, \
+ }
+
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
mas_pause(&vmi->mas);
@@ -1240,4 +1262,22 @@ static inline int mapping_map_writable(struct address_space *mapping)
return 0;
}
+static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
+{
+ (void)pmc;
+
+ return 0;
+}
+
+static inline void free_pgd_range(struct mmu_gather *tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
+{
+ (void)tlb;
+ (void)addr;
+ (void)end;
+ (void)floor;
+ (void)ceiling;
+}
+
#endif /* __MM_VMA_INTERNAL_H */
--
2.49.0
On Mon, Apr 28, 2025 at 04:28:14PM +0100, Lorenzo Stoakes wrote:
> There is functionality that overlaps the exec and memory mapping
> subsystems. While it properly belongs in mm, it is important that exec
> maintainers maintain oversight of this functionality correctly.
>
> We can establish both goals by adding a new mm/vma_exec.c file which
> contains these 'glue' functions, and have fs/exec.c import them.
>
> As a part of this change, to ensure that proper oversight is achieved, add
> the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
>
> scripts/get_maintainer.pl can correctly handle files in multiple entries
> and this neatly handles the cross-over.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Pedro Falcato <pfalcato@suse.de>

--
Pedro
On 28.04.25 17:28, Lorenzo Stoakes wrote:
> There is functionality that overlaps the exec and memory mapping
> subsystems. While it properly belongs in mm, it is important that exec
> maintainers maintain oversight of this functionality correctly.
>
> We can establish both goals by adding a new mm/vma_exec.c file which
> contains these 'glue' functions, and have fs/exec.c import them.
>
> As a part of this change, to ensure that proper oversight is achieved, add
> the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
>
> scripts/get_maintainer.pl can correctly handle files in multiple entries
> and this neatly handles the cross-over.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---

Reviewed-by: David Hildenbrand <david@redhat.com>

--
Cheers,

David / dhildenb
On Mon, Apr 28, 2025 at 04:28:14PM +0100, Lorenzo Stoakes wrote:
> There is functionality that overlaps the exec and memory mapping
> subsystems. While it properly belongs in mm, it is important that exec
> maintainers maintain oversight of this functionality correctly.
>
> We can establish both goals by adding a new mm/vma_exec.c file which
> contains these 'glue' functions, and have fs/exec.c import them.
>
> As a part of this change, to ensure that proper oversight is achieved, add
> the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
>
> scripts/get_maintainer.pl can correctly handle files in multiple entries
> and this neatly handles the cross-over.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

(I realize I didn't actually send tags...)

Reviewed-by: Kees Cook <kees@kernel.org>

--
Kees Cook
On 4/28/25 17:28, Lorenzo Stoakes wrote:
> There is functionality that overlaps the exec and memory mapping
> subsystems. While it properly belongs in mm, it is important that exec
> maintainers maintain oversight of this functionality correctly.
>
> We can establish both goals by adding a new mm/vma_exec.c file which
> contains these 'glue' functions, and have fs/exec.c import them.
>
> As a part of this change, to ensure that proper oversight is achieved, add
> the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
>
> scripts/get_maintainer.pl can correctly handle files in multiple entries
> and this neatly handles the cross-over.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250428 11:28]:
> There is functionality that overlaps the exec and memory mapping
> subsystems. While it properly belongs in mm, it is important that exec
> maintainers maintain oversight of this functionality correctly.
>
> We can establish both goals by adding a new mm/vma_exec.c file which
> contains these 'glue' functions, and have fs/exec.c import them.
>
> As a part of this change, to ensure that proper oversight is achieved, add
> the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
>
> scripts/get_maintainer.pl can correctly handle files in multiple entries
> and this neatly handles the cross-over.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> ---
> MAINTAINERS | 2 +
> fs/exec.c | 3 ++
> include/linux/mm.h | 1 -
> mm/Makefile | 2 +-
> mm/mmap.c | 83 ----------------------------
> mm/vma.h | 5 ++
> mm/vma_exec.c | 92 ++++++++++++++++++++++++++++++++
> tools/testing/vma/Makefile | 2 +-
> tools/testing/vma/vma.c | 1 +
> tools/testing/vma/vma_internal.h | 40 ++++++++++++++
> 10 files changed, 145 insertions(+), 86 deletions(-)
> create mode 100644 mm/vma_exec.c
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f5ee0390cdee..1ee1c22e6e36 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -8830,6 +8830,7 @@ F: include/linux/elf.h
> F: include/uapi/linux/auxvec.h
> F: include/uapi/linux/binfmts.h
> F: include/uapi/linux/elf.h
> +F: mm/vma_exec.c
> F: tools/testing/selftests/exec/
> N: asm/elf.h
> N: binfmt
> @@ -15654,6 +15655,7 @@ F: mm/mremap.c
> F: mm/mseal.c
> F: mm/vma.c
> F: mm/vma.h
> +F: mm/vma_exec.c
> F: mm/vma_internal.h
> F: tools/testing/selftests/mm/merge.c
> F: tools/testing/vma/
> diff --git a/fs/exec.c b/fs/exec.c
> index 8e4ea5f1e64c..477bc3f2e966 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -78,6 +78,9 @@
>
> #include <trace/events/sched.h>
>
> +/* For vma exec functions. */
> +#include "../mm/internal.h"
> +
> static int bprm_creds_from_file(struct linux_binprm *bprm);
>
> int suid_dumpable = 0;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 21dd110b6655..4fc361df9ad7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3223,7 +3223,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
> extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
> extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> extern void exit_mmap(struct mm_struct *);
> -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> unsigned long addr, bool write);
>
> diff --git a/mm/Makefile b/mm/Makefile
> index 9d7e5b5bb694..15a901bb431a 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -37,7 +37,7 @@ mmu-y := nommu.o
> mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
> mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
> msync.o page_vma_mapped.o pagewalk.o \
> - pgtable-generic.o rmap.o vmalloc.o vma.o
> + pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
>
>
> ifdef CONFIG_CROSS_MEMORY_ATTACH
> diff --git a/mm/mmap.c b/mm/mmap.c
> index bd210aaf7ebd..1794bf6f4dc0 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1717,89 +1717,6 @@ static int __meminit init_reserve_notifier(void)
> }
> subsys_initcall(init_reserve_notifier);
>
> -/*
> - * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
> - * this VMA and its relocated range, which will now reside at [vma->vm_start -
> - * shift, vma->vm_end - shift).
> - *
> - * This function is almost certainly NOT what you want for anything other than
> - * early executable temporary stack relocation.
> - */
> -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> -{
> - /*
> - * The process proceeds as follows:
> - *
> - * 1) Use shift to calculate the new vma endpoints.
> - * 2) Extend vma to cover both the old and new ranges. This ensures the
> - * arguments passed to subsequent functions are consistent.
> - * 3) Move vma's page tables to the new range.
> - * 4) Free up any cleared pgd range.
> - * 5) Shrink the vma to cover only the new range.
> - */
> -
> - struct mm_struct *mm = vma->vm_mm;
> - unsigned long old_start = vma->vm_start;
> - unsigned long old_end = vma->vm_end;
> - unsigned long length = old_end - old_start;
> - unsigned long new_start = old_start - shift;
> - unsigned long new_end = old_end - shift;
> - VMA_ITERATOR(vmi, mm, new_start);
> - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
> - struct vm_area_struct *next;
> - struct mmu_gather tlb;
> - PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
> -
> - BUG_ON(new_start > new_end);
> -
> - /*
> - * ensure there are no vmas between where we want to go
> - * and where we are
> - */
> - if (vma != vma_next(&vmi))
> - return -EFAULT;
> -
> - vma_iter_prev_range(&vmi);
> - /*
> - * cover the whole range: [new_start, old_end)
> - */
> - vmg.middle = vma;
> - if (vma_expand(&vmg))
> - return -ENOMEM;
> -
> - /*
> - * move the page tables downwards, on failure we rely on
> - * process cleanup to remove whatever mess we made.
> - */
> - pmc.for_stack = true;
> - if (length != move_page_tables(&pmc))
> - return -ENOMEM;
> -
> - tlb_gather_mmu(&tlb, mm);
> - next = vma_next(&vmi);
> - if (new_end > old_start) {
> - /*
> - * when the old and new regions overlap clear from new_end.
> - */
> - free_pgd_range(&tlb, new_end, old_end, new_end,
> - next ? next->vm_start : USER_PGTABLES_CEILING);
> - } else {
> - /*
> - * otherwise, clean from old_start; this is done to not touch
> - * the address space in [new_end, old_start) some architectures
> - * have constraints on va-space that make this illegal (IA64) -
> - * for the others its just a little faster.
> - */
> - free_pgd_range(&tlb, old_start, old_end, new_end,
> - next ? next->vm_start : USER_PGTABLES_CEILING);
> - }
> - tlb_finish_mmu(&tlb);
> -
> - vma_prev(&vmi);
> - /* Shrink the vma to just the new range */
> - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> -}
> -
> #ifdef CONFIG_MMU
> /*
> * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> diff --git a/mm/vma.h b/mm/vma.h
> index 149926e8a6d1..1ce3e18f01b7 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -548,4 +548,9 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>
> int __vm_munmap(unsigned long start, size_t len, bool unlock);
>
> +/* vma_exec.h */
> +#ifdef CONFIG_MMU
> +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> +#endif
> +
> #endif /* __MM_VMA_H */
> diff --git a/mm/vma_exec.c b/mm/vma_exec.c
> new file mode 100644
> index 000000000000..6736ae37f748
> --- /dev/null
> +++ b/mm/vma_exec.c
> @@ -0,0 +1,92 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +/*
> + * Functions explicitly implemented for exec functionality which however are
> + * explicitly VMA-only logic.
> + */
> +
> +#include "vma_internal.h"
> +#include "vma.h"
> +
> +/*
> + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
> + * this VMA and its relocated range, which will now reside at [vma->vm_start -
> + * shift, vma->vm_end - shift).
> + *
> + * This function is almost certainly NOT what you want for anything other than
> + * early executable temporary stack relocation.
> + */
> +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> +{
> + /*
> + * The process proceeds as follows:
> + *
> + * 1) Use shift to calculate the new vma endpoints.
> + * 2) Extend vma to cover both the old and new ranges. This ensures the
> + * arguments passed to subsequent functions are consistent.
> + * 3) Move vma's page tables to the new range.
> + * 4) Free up any cleared pgd range.
> + * 5) Shrink the vma to cover only the new range.
> + */
> +
> + struct mm_struct *mm = vma->vm_mm;
> + unsigned long old_start = vma->vm_start;
> + unsigned long old_end = vma->vm_end;
> + unsigned long length = old_end - old_start;
> + unsigned long new_start = old_start - shift;
> + unsigned long new_end = old_end - shift;
> + VMA_ITERATOR(vmi, mm, new_start);
> + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
> + struct vm_area_struct *next;
> + struct mmu_gather tlb;
> + PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
> +
> + BUG_ON(new_start > new_end);
> +
> + /*
> + * ensure there are no vmas between where we want to go
> + * and where we are
> + */
> + if (vma != vma_next(&vmi))
> + return -EFAULT;
> +
> + vma_iter_prev_range(&vmi);
> + /*
> + * cover the whole range: [new_start, old_end)
> + */
> + vmg.middle = vma;
> + if (vma_expand(&vmg))
> + return -ENOMEM;
> +
> + /*
> + * move the page tables downwards, on failure we rely on
> + * process cleanup to remove whatever mess we made.
> + */
> + pmc.for_stack = true;
> + if (length != move_page_tables(&pmc))
> + return -ENOMEM;
> +
> + tlb_gather_mmu(&tlb, mm);
> + next = vma_next(&vmi);
> + if (new_end > old_start) {
> + /*
> + * when the old and new regions overlap clear from new_end.
> + */
> + free_pgd_range(&tlb, new_end, old_end, new_end,
> + next ? next->vm_start : USER_PGTABLES_CEILING);
> + } else {
> + /*
> + * otherwise, clean from old_start; this is done to not touch
> + * the address space in [new_end, old_start) some architectures
> + * have constraints on va-space that make this illegal (IA64) -
> + * for the others its just a little faster.
> + */
> + free_pgd_range(&tlb, old_start, old_end, new_end,
> + next ? next->vm_start : USER_PGTABLES_CEILING);
> + }
> + tlb_finish_mmu(&tlb);
> +
> + vma_prev(&vmi);
> + /* Shrink the vma to just the new range */
> + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> +}
> diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> index 860fd2311dcc..624040fcf193 100644
> --- a/tools/testing/vma/Makefile
> +++ b/tools/testing/vma/Makefile
> @@ -9,7 +9,7 @@ include ../shared/shared.mk
> OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> TARGETS = vma
>
> -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h
> +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
>
> vma: $(OFILES)
> $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> index 7cfd6e31db10..5832ae5d797d 100644
> --- a/tools/testing/vma/vma.c
> +++ b/tools/testing/vma/vma.c
> @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> * Directly import the VMA implementation here. Our vma_internal.h wrapper
> * provides userland-equivalent functionality for everything vma.c uses.
> */
> +#include "../../../mm/vma_exec.c"
> #include "../../../mm/vma.c"
>
> const struct vm_operations_struct vma_dummy_vm_ops;
> diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> index 572ab2cea763..0df19ca0000a 100644
> --- a/tools/testing/vma/vma_internal.h
> +++ b/tools/testing/vma/vma_internal.h
> @@ -421,6 +421,28 @@ struct vm_unmapped_area_info {
> unsigned long start_gap;
> };
>
> +struct pagetable_move_control {
> + struct vm_area_struct *old; /* Source VMA. */
> + struct vm_area_struct *new; /* Destination VMA. */
> + unsigned long old_addr; /* Address from which the move begins. */
> + unsigned long old_end; /* Exclusive address at which old range ends. */
> + unsigned long new_addr; /* Address to move page tables to. */
> + unsigned long len_in; /* Bytes to remap specified by user. */
> +
> + bool need_rmap_locks; /* Do rmap locks need to be taken? */
> + bool for_stack; /* Is this an early temp stack being moved? */
> +};
> +
> +#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
> + struct pagetable_move_control name = { \
> + .old = old_, \
> + .new = new_, \
> + .old_addr = old_addr_, \
> + .old_end = (old_addr_) + (len_), \
> + .new_addr = new_addr_, \
> + .len_in = len_, \
> + }
> +
> static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> {
> mas_pause(&vmi->mas);
> @@ -1240,4 +1262,22 @@ static inline int mapping_map_writable(struct address_space *mapping)
> return 0;
> }
>
> +static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
> +{
> + (void)pmc;
> +
> + return 0;
> +}
> +
> +static inline void free_pgd_range(struct mmu_gather *tlb,
> + unsigned long addr, unsigned long end,
> + unsigned long floor, unsigned long ceiling)
> +{
> + (void)tlb;
> + (void)addr;
> + (void)end;
> + (void)floor;
> + (void)ceiling;
> +}
> +
> #endif /* __MM_VMA_INTERNAL_H */
> --
> 2.49.0
>
On Mon, Apr 28, 2025 at 12:20 PM Liam R. Howlett
<Liam.Howlett@oracle.com> wrote:
>
> * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250428 11:28]:
> > There is functionality that overlaps the exec and memory mapping
> > subsystems. While it properly belongs in mm, it is important that exec
> > maintainers maintain oversight of this functionality correctly.
> >
> > We can establish both goals by adding a new mm/vma_exec.c file which
> > contains these 'glue' functions, and have fs/exec.c import them.
> >
> > As a part of this change, to ensure that proper oversight is achieved, add
> > the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
> >
> > scripts/get_maintainer.pl can correctly handle files in multiple entries
> > and this neatly handles the cross-over.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
>
> > ---
> > MAINTAINERS | 2 +
> > fs/exec.c | 3 ++
> > include/linux/mm.h | 1 -
> > mm/Makefile | 2 +-
> > mm/mmap.c | 83 ----------------------------
> > mm/vma.h | 5 ++
> > mm/vma_exec.c | 92 ++++++++++++++++++++++++++++++++
> > tools/testing/vma/Makefile | 2 +-
> > tools/testing/vma/vma.c | 1 +
> > tools/testing/vma/vma_internal.h | 40 ++++++++++++++
> > 10 files changed, 145 insertions(+), 86 deletions(-)
> > create mode 100644 mm/vma_exec.c
> >
> > diff --git a/MAINTAINERS b/MAINTAINERS
> > index f5ee0390cdee..1ee1c22e6e36 100644
> > --- a/MAINTAINERS
> > +++ b/MAINTAINERS
> > @@ -8830,6 +8830,7 @@ F: include/linux/elf.h
> > F: include/uapi/linux/auxvec.h
> > F: include/uapi/linux/binfmts.h
> > F: include/uapi/linux/elf.h
> > +F: mm/vma_exec.c
> > F: tools/testing/selftests/exec/
> > N: asm/elf.h
> > N: binfmt
> > @@ -15654,6 +15655,7 @@ F: mm/mremap.c
> > F: mm/mseal.c
> > F: mm/vma.c
> > F: mm/vma.h
> > +F: mm/vma_exec.c
> > F: mm/vma_internal.h
> > F: tools/testing/selftests/mm/merge.c
> > F: tools/testing/vma/
> > diff --git a/fs/exec.c b/fs/exec.c
> > index 8e4ea5f1e64c..477bc3f2e966 100644
> > --- a/fs/exec.c
> > +++ b/fs/exec.c
> > @@ -78,6 +78,9 @@
> >
> > #include <trace/events/sched.h>
> >
> > +/* For vma exec functions. */
> > +#include "../mm/internal.h"
> > +
> > static int bprm_creds_from_file(struct linux_binprm *bprm);
> >
> > int suid_dumpable = 0;
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 21dd110b6655..4fc361df9ad7 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -3223,7 +3223,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
> > extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
> > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> > extern void exit_mmap(struct mm_struct *);
> > -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> > bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> > unsigned long addr, bool write);
> >
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 9d7e5b5bb694..15a901bb431a 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -37,7 +37,7 @@ mmu-y := nommu.o
> > mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
> > mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
> > msync.o page_vma_mapped.o pagewalk.o \
> > - pgtable-generic.o rmap.o vmalloc.o vma.o
> > + pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
> >
> >
> > ifdef CONFIG_CROSS_MEMORY_ATTACH
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index bd210aaf7ebd..1794bf6f4dc0 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1717,89 +1717,6 @@ static int __meminit init_reserve_notifier(void)
> > }
> > subsys_initcall(init_reserve_notifier);
> >
> > -/*
> > - * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
> > - * this VMA and its relocated range, which will now reside at [vma->vm_start -
> > - * shift, vma->vm_end - shift).
> > - *
> > - * This function is almost certainly NOT what you want for anything other than
> > - * early executable temporary stack relocation.
> > - */
> > -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > -{
> > - /*
> > - * The process proceeds as follows:
> > - *
> > - * 1) Use shift to calculate the new vma endpoints.
> > - * 2) Extend vma to cover both the old and new ranges. This ensures the
> > - * arguments passed to subsequent functions are consistent.
> > - * 3) Move vma's page tables to the new range.
> > - * 4) Free up any cleared pgd range.
> > - * 5) Shrink the vma to cover only the new range.
> > - */
> > -
> > - struct mm_struct *mm = vma->vm_mm;
> > - unsigned long old_start = vma->vm_start;
> > - unsigned long old_end = vma->vm_end;
> > - unsigned long length = old_end - old_start;
> > - unsigned long new_start = old_start - shift;
> > - unsigned long new_end = old_end - shift;
> > - VMA_ITERATOR(vmi, mm, new_start);
> > - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
> > - struct vm_area_struct *next;
> > - struct mmu_gather tlb;
> > - PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
> > -
> > - BUG_ON(new_start > new_end);
> > -
> > - /*
> > - * ensure there are no vmas between where we want to go
> > - * and where we are
> > - */
> > - if (vma != vma_next(&vmi))
> > - return -EFAULT;
> > -
> > - vma_iter_prev_range(&vmi);
> > - /*
> > - * cover the whole range: [new_start, old_end)
> > - */
> > - vmg.middle = vma;
> > - if (vma_expand(&vmg))
> > - return -ENOMEM;
> > -
> > - /*
> > - * move the page tables downwards, on failure we rely on
> > - * process cleanup to remove whatever mess we made.
> > - */
> > - pmc.for_stack = true;
> > - if (length != move_page_tables(&pmc))
> > - return -ENOMEM;
> > -
> > - tlb_gather_mmu(&tlb, mm);
> > - next = vma_next(&vmi);
> > - if (new_end > old_start) {
> > - /*
> > - * when the old and new regions overlap clear from new_end.
> > - */
> > - free_pgd_range(&tlb, new_end, old_end, new_end,
> > - next ? next->vm_start : USER_PGTABLES_CEILING);
> > - } else {
> > - /*
> > - * otherwise, clean from old_start; this is done to not touch
> > - * the address space in [new_end, old_start) some architectures
> > - * have constraints on va-space that make this illegal (IA64) -
> > - * for the others its just a little faster.
> > - */
> > - free_pgd_range(&tlb, old_start, old_end, new_end,
> > - next ? next->vm_start : USER_PGTABLES_CEILING);
> > - }
> > - tlb_finish_mmu(&tlb);
> > -
> > - vma_prev(&vmi);
> > - /* Shrink the vma to just the new range */
> > - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > -}
> > -
> > #ifdef CONFIG_MMU
> > /*
> > * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> > diff --git a/mm/vma.h b/mm/vma.h
> > index 149926e8a6d1..1ce3e18f01b7 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -548,4 +548,9 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
> >
> > int __vm_munmap(unsigned long start, size_t len, bool unlock);
> >
> > +/* vma_exec.h */
nit: Did you mean vma_exec.c ?
> > +#ifdef CONFIG_MMU
> > +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> > +#endif
> > +
> > #endif /* __MM_VMA_H */
> > diff --git a/mm/vma_exec.c b/mm/vma_exec.c
> > new file mode 100644
> > index 000000000000..6736ae37f748
> > --- /dev/null
> > +++ b/mm/vma_exec.c
> > @@ -0,0 +1,92 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +
> > +/*
> > + * Functions explicitly implemented for exec functionality which however are
> > + * explicitly VMA-only logic.
> > + */
> > +
> > +#include "vma_internal.h"
> > +#include "vma.h"
> > +
> > +/*
> > + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
> > + * this VMA and its relocated range, which will now reside at [vma->vm_start -
> > + * shift, vma->vm_end - shift).
> > + *
> > + * This function is almost certainly NOT what you want for anything other than
> > + * early executable temporary stack relocation.
> > + */
> > +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > +{
> > + /*
> > + * The process proceeds as follows:
> > + *
> > + * 1) Use shift to calculate the new vma endpoints.
> > + * 2) Extend vma to cover both the old and new ranges. This ensures the
> > + * arguments passed to subsequent functions are consistent.
> > + * 3) Move vma's page tables to the new range.
> > + * 4) Free up any cleared pgd range.
> > + * 5) Shrink the vma to cover only the new range.
> > + */
> > +
> > + struct mm_struct *mm = vma->vm_mm;
> > + unsigned long old_start = vma->vm_start;
> > + unsigned long old_end = vma->vm_end;
> > + unsigned long length = old_end - old_start;
> > + unsigned long new_start = old_start - shift;
> > + unsigned long new_end = old_end - shift;
> > + VMA_ITERATOR(vmi, mm, new_start);
> > + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
> > + struct vm_area_struct *next;
> > + struct mmu_gather tlb;
> > + PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
> > +
> > + BUG_ON(new_start > new_end);
> > +
> > + /*
> > + * ensure there are no vmas between where we want to go
> > + * and where we are
> > + */
> > + if (vma != vma_next(&vmi))
> > + return -EFAULT;
> > +
> > + vma_iter_prev_range(&vmi);
> > + /*
> > + * cover the whole range: [new_start, old_end)
> > + */
> > + vmg.middle = vma;
> > + if (vma_expand(&vmg))
> > + return -ENOMEM;
> > +
> > + /*
> > + * move the page tables downwards, on failure we rely on
> > + * process cleanup to remove whatever mess we made.
> > + */
> > + pmc.for_stack = true;
> > + if (length != move_page_tables(&pmc))
> > + return -ENOMEM;
> > +
> > + tlb_gather_mmu(&tlb, mm);
> > + next = vma_next(&vmi);
> > + if (new_end > old_start) {
> > + /*
> > + * when the old and new regions overlap clear from new_end.
> > + */
> > + free_pgd_range(&tlb, new_end, old_end, new_end,
> > + next ? next->vm_start : USER_PGTABLES_CEILING);
> > + } else {
> > + /*
> > + * otherwise, clean from old_start; this is done to not touch
> > + * the address space in [new_end, old_start) some architectures
> > + * have constraints on va-space that make this illegal (IA64) -
> > + * for the others its just a little faster.
> > + */
> > + free_pgd_range(&tlb, old_start, old_end, new_end,
> > + next ? next->vm_start : USER_PGTABLES_CEILING);
> > + }
> > + tlb_finish_mmu(&tlb);
> > +
> > + vma_prev(&vmi);
> > + /* Shrink the vma to just the new range */
> > + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > +}
> > diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> > index 860fd2311dcc..624040fcf193 100644
> > --- a/tools/testing/vma/Makefile
> > +++ b/tools/testing/vma/Makefile
> > @@ -9,7 +9,7 @@ include ../shared/shared.mk
> > OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> > TARGETS = vma
> >
> > -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h
> > +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> >
> > vma: $(OFILES)
> > $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> > index 7cfd6e31db10..5832ae5d797d 100644
> > --- a/tools/testing/vma/vma.c
> > +++ b/tools/testing/vma/vma.c
> > @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> > * Directly import the VMA implementation here. Our vma_internal.h wrapper
> > * provides userland-equivalent functionality for everything vma.c uses.
> > */
> > +#include "../../../mm/vma_exec.c"
> > #include "../../../mm/vma.c"
> >
> > const struct vm_operations_struct vma_dummy_vm_ops;
> > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > index 572ab2cea763..0df19ca0000a 100644
> > --- a/tools/testing/vma/vma_internal.h
> > +++ b/tools/testing/vma/vma_internal.h
> > @@ -421,6 +421,28 @@ struct vm_unmapped_area_info {
> > unsigned long start_gap;
> > };
> >
> > +struct pagetable_move_control {
> > + struct vm_area_struct *old; /* Source VMA. */
> > + struct vm_area_struct *new; /* Destination VMA. */
> > + unsigned long old_addr; /* Address from which the move begins. */
> > + unsigned long old_end; /* Exclusive address at which old range ends. */
> > + unsigned long new_addr; /* Address to move page tables to. */
> > + unsigned long len_in; /* Bytes to remap specified by user. */
> > +
> > + bool need_rmap_locks; /* Do rmap locks need to be taken? */
> > + bool for_stack; /* Is this an early temp stack being moved? */
> > +};
> > +
> > +#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
> > + struct pagetable_move_control name = { \
> > + .old = old_, \
> > + .new = new_, \
> > + .old_addr = old_addr_, \
> > + .old_end = (old_addr_) + (len_), \
> > + .new_addr = new_addr_, \
> > + .len_in = len_, \
> > + }
> > +
> > static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> > {
> > mas_pause(&vmi->mas);
> > @@ -1240,4 +1262,22 @@ static inline int mapping_map_writable(struct address_space *mapping)
> > return 0;
> > }
> >
> > +static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
> > +{
> > + (void)pmc;
> > +
> > + return 0;
> > +}
> > +
> > +static inline void free_pgd_range(struct mmu_gather *tlb,
> > + unsigned long addr, unsigned long end,
> > + unsigned long floor, unsigned long ceiling)
> > +{
> > + (void)tlb;
> > + (void)addr;
> > + (void)end;
> > + (void)floor;
> > + (void)ceiling;
> > +}
> > +
> > #endif /* __MM_VMA_INTERNAL_H */
> > --
> > 2.49.0
> >
Andrew - I typo'd /* vma_exec.h */ below in the change to mm/vma.h - would it be
possible to correct to vma_exec.c, or would a fixpatch make life easier?
Cheers, Lorenzo
On Mon, Apr 28, 2025 at 01:14:31PM -0700, Suren Baghdasaryan wrote:
> On Mon, Apr 28, 2025 at 12:20 PM Liam R. Howlett
> <Liam.Howlett@oracle.com> wrote:
> >
> > * Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250428 11:28]:
> > > There is functionality that overlaps the exec and memory mapping
> > > subsystems. While it properly belongs in mm, it is important that exec
> > > maintainers maintain oversight of this functionality correctly.
> > >
> > > We can establish both goals by adding a new mm/vma_exec.c file which
> > > contains these 'glue' functions, and have fs/exec.c import them.
> > >
> > > As a part of this change, to ensure that proper oversight is achieved, add
> > > the file to both the MEMORY MAPPING and EXEC & BINFMT API, ELF sections.
> > >
> > > scripts/get_maintainer.pl can correctly handle files in multiple entries
> > > and this neatly handles the cross-over.
> > >
> > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> >
> > Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Thanks!
>
> >
> > > ---
> > > MAINTAINERS | 2 +
> > > fs/exec.c | 3 ++
> > > include/linux/mm.h | 1 -
> > > mm/Makefile | 2 +-
> > > mm/mmap.c | 83 ----------------------------
> > > mm/vma.h | 5 ++
> > > mm/vma_exec.c | 92 ++++++++++++++++++++++++++++++++
> > > tools/testing/vma/Makefile | 2 +-
> > > tools/testing/vma/vma.c | 1 +
> > > tools/testing/vma/vma_internal.h | 40 ++++++++++++++
> > > 10 files changed, 145 insertions(+), 86 deletions(-)
> > > create mode 100644 mm/vma_exec.c
> > >
> > > diff --git a/MAINTAINERS b/MAINTAINERS
> > > index f5ee0390cdee..1ee1c22e6e36 100644
> > > --- a/MAINTAINERS
> > > +++ b/MAINTAINERS
> > > @@ -8830,6 +8830,7 @@ F: include/linux/elf.h
> > > F: include/uapi/linux/auxvec.h
> > > F: include/uapi/linux/binfmts.h
> > > F: include/uapi/linux/elf.h
> > > +F: mm/vma_exec.c
> > > F: tools/testing/selftests/exec/
> > > N: asm/elf.h
> > > N: binfmt
> > > @@ -15654,6 +15655,7 @@ F: mm/mremap.c
> > > F: mm/mseal.c
> > > F: mm/vma.c
> > > F: mm/vma.h
> > > +F: mm/vma_exec.c
> > > F: mm/vma_internal.h
> > > F: tools/testing/selftests/mm/merge.c
> > > F: tools/testing/vma/
> > > diff --git a/fs/exec.c b/fs/exec.c
> > > index 8e4ea5f1e64c..477bc3f2e966 100644
> > > --- a/fs/exec.c
> > > +++ b/fs/exec.c
> > > @@ -78,6 +78,9 @@
> > >
> > > #include <trace/events/sched.h>
> > >
> > > +/* For vma exec functions. */
> > > +#include "../mm/internal.h"
> > > +
> > > static int bprm_creds_from_file(struct linux_binprm *bprm);
> > >
> > > int suid_dumpable = 0;
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 21dd110b6655..4fc361df9ad7 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -3223,7 +3223,6 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
> > > extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
> > > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> > > extern void exit_mmap(struct mm_struct *);
> > > -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> > > bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> > > unsigned long addr, bool write);
> > >
> > > diff --git a/mm/Makefile b/mm/Makefile
> > > index 9d7e5b5bb694..15a901bb431a 100644
> > > --- a/mm/Makefile
> > > +++ b/mm/Makefile
> > > @@ -37,7 +37,7 @@ mmu-y := nommu.o
> > > mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
> > > mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
> > > msync.o page_vma_mapped.o pagewalk.o \
> > > - pgtable-generic.o rmap.o vmalloc.o vma.o
> > > + pgtable-generic.o rmap.o vmalloc.o vma.o vma_exec.o
> > >
> > >
> > > ifdef CONFIG_CROSS_MEMORY_ATTACH
> > > diff --git a/mm/mmap.c b/mm/mmap.c
> > > index bd210aaf7ebd..1794bf6f4dc0 100644
> > > --- a/mm/mmap.c
> > > +++ b/mm/mmap.c
> > > @@ -1717,89 +1717,6 @@ static int __meminit init_reserve_notifier(void)
> > > }
> > > subsys_initcall(init_reserve_notifier);
> > >
> > > -/*
> > > - * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
> > > - * this VMA and its relocated range, which will now reside at [vma->vm_start -
> > > - * shift, vma->vm_end - shift).
> > > - *
> > > - * This function is almost certainly NOT what you want for anything other than
> > > - * early executable temporary stack relocation.
> > > - */
> > > -int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > -{
> > > - /*
> > > - * The process proceeds as follows:
> > > - *
> > > - * 1) Use shift to calculate the new vma endpoints.
> > > - * 2) Extend vma to cover both the old and new ranges. This ensures the
> > > - * arguments passed to subsequent functions are consistent.
> > > - * 3) Move vma's page tables to the new range.
> > > - * 4) Free up any cleared pgd range.
> > > - * 5) Shrink the vma to cover only the new range.
> > > - */
> > > -
> > > - struct mm_struct *mm = vma->vm_mm;
> > > - unsigned long old_start = vma->vm_start;
> > > - unsigned long old_end = vma->vm_end;
> > > - unsigned long length = old_end - old_start;
> > > - unsigned long new_start = old_start - shift;
> > > - unsigned long new_end = old_end - shift;
> > > - VMA_ITERATOR(vmi, mm, new_start);
> > > - VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
> > > - struct vm_area_struct *next;
> > > - struct mmu_gather tlb;
> > > - PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
> > > -
> > > - BUG_ON(new_start > new_end);
> > > -
> > > - /*
> > > - * ensure there are no vmas between where we want to go
> > > - * and where we are
> > > - */
> > > - if (vma != vma_next(&vmi))
> > > - return -EFAULT;
> > > -
> > > - vma_iter_prev_range(&vmi);
> > > - /*
> > > - * cover the whole range: [new_start, old_end)
> > > - */
> > > - vmg.middle = vma;
> > > - if (vma_expand(&vmg))
> > > - return -ENOMEM;
> > > -
> > > - /*
> > > - * move the page tables downwards, on failure we rely on
> > > - * process cleanup to remove whatever mess we made.
> > > - */
> > > - pmc.for_stack = true;
> > > - if (length != move_page_tables(&pmc))
> > > - return -ENOMEM;
> > > -
> > > - tlb_gather_mmu(&tlb, mm);
> > > - next = vma_next(&vmi);
> > > - if (new_end > old_start) {
> > > - /*
> > > - * when the old and new regions overlap clear from new_end.
> > > - */
> > > - free_pgd_range(&tlb, new_end, old_end, new_end,
> > > - next ? next->vm_start : USER_PGTABLES_CEILING);
> > > - } else {
> > > - /*
> > > - * otherwise, clean from old_start; this is done to not touch
> > > - * the address space in [new_end, old_start) some architectures
> > > - * have constraints on va-space that make this illegal (IA64) -
> > > - * for the others its just a little faster.
> > > - */
> > > - free_pgd_range(&tlb, old_start, old_end, new_end,
> > > - next ? next->vm_start : USER_PGTABLES_CEILING);
> > > - }
> > > - tlb_finish_mmu(&tlb);
> > > -
> > > - vma_prev(&vmi);
> > > - /* Shrink the vma to just the new range */
> > > - return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > > -}
> > > -
> > > #ifdef CONFIG_MMU
> > > /*
> > > * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> > > diff --git a/mm/vma.h b/mm/vma.h
> > > index 149926e8a6d1..1ce3e18f01b7 100644
> > > --- a/mm/vma.h
> > > +++ b/mm/vma.h
> > > @@ -548,4 +548,9 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address);
> > >
> > > int __vm_munmap(unsigned long start, size_t len, bool unlock);
> > >
> > > +/* vma_exec.h */
>
> nit: Did you mean vma_exec.c ?
Oops yeah, I did the same for vma_init.[ch] too lol, so at least consistent...
>
> > > +#ifdef CONFIG_MMU
> > > +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> > > +#endif
> > > +
> > > #endif /* __MM_VMA_H */
> > > diff --git a/mm/vma_exec.c b/mm/vma_exec.c
> > > new file mode 100644
> > > index 000000000000..6736ae37f748
> > > --- /dev/null
> > > +++ b/mm/vma_exec.c
> > > @@ -0,0 +1,92 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +
> > > +/*
> > > + * Functions explicitly implemented for exec functionality which however are
> > > + * explicitly VMA-only logic.
> > > + */
> > > +
> > > +#include "vma_internal.h"
> > > +#include "vma.h"
> > > +
> > > +/*
> > > + * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
> > > + * this VMA and its relocated range, which will now reside at [vma->vm_start -
> > > + * shift, vma->vm_end - shift).
> > > + *
> > > + * This function is almost certainly NOT what you want for anything other than
> > > + * early executable temporary stack relocation.
> > > + */
> > > +int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > > +{
> > > + /*
> > > + * The process proceeds as follows:
> > > + *
> > > + * 1) Use shift to calculate the new vma endpoints.
> > > + * 2) Extend vma to cover both the old and new ranges. This ensures the
> > > + * arguments passed to subsequent functions are consistent.
> > > + * 3) Move vma's page tables to the new range.
> > > + * 4) Free up any cleared pgd range.
> > > + * 5) Shrink the vma to cover only the new range.
> > > + */
> > > +
> > > + struct mm_struct *mm = vma->vm_mm;
> > > + unsigned long old_start = vma->vm_start;
> > > + unsigned long old_end = vma->vm_end;
> > > + unsigned long length = old_end - old_start;
> > > + unsigned long new_start = old_start - shift;
> > > + unsigned long new_end = old_end - shift;
> > > + VMA_ITERATOR(vmi, mm, new_start);
> > > + VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
> > > + struct vm_area_struct *next;
> > > + struct mmu_gather tlb;
> > > + PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);
> > > +
> > > + BUG_ON(new_start > new_end);
> > > +
> > > + /*
> > > + * ensure there are no vmas between where we want to go
> > > + * and where we are
> > > + */
> > > + if (vma != vma_next(&vmi))
> > > + return -EFAULT;
> > > +
> > > + vma_iter_prev_range(&vmi);
> > > + /*
> > > + * cover the whole range: [new_start, old_end)
> > > + */
> > > + vmg.middle = vma;
> > > + if (vma_expand(&vmg))
> > > + return -ENOMEM;
> > > +
> > > + /*
> > > + * move the page tables downwards, on failure we rely on
> > > + * process cleanup to remove whatever mess we made.
> > > + */
> > > + pmc.for_stack = true;
> > > + if (length != move_page_tables(&pmc))
> > > + return -ENOMEM;
> > > +
> > > + tlb_gather_mmu(&tlb, mm);
> > > + next = vma_next(&vmi);
> > > + if (new_end > old_start) {
> > > + /*
> > > + * when the old and new regions overlap clear from new_end.
> > > + */
> > > + free_pgd_range(&tlb, new_end, old_end, new_end,
> > > + next ? next->vm_start : USER_PGTABLES_CEILING);
> > > + } else {
> > > + /*
> > > + * otherwise, clean from old_start; this is done to not touch
> > > + * the address space in [new_end, old_start) some architectures
> > > + * have constraints on va-space that make this illegal (IA64) -
> > > + * for the others its just a little faster.
> > > + */
> > > + free_pgd_range(&tlb, old_start, old_end, new_end,
> > > + next ? next->vm_start : USER_PGTABLES_CEILING);
> > > + }
> > > + tlb_finish_mmu(&tlb);
> > > +
> > > + vma_prev(&vmi);
> > > + /* Shrink the vma to just the new range */
> > > + return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > > +}
> > > diff --git a/tools/testing/vma/Makefile b/tools/testing/vma/Makefile
> > > index 860fd2311dcc..624040fcf193 100644
> > > --- a/tools/testing/vma/Makefile
> > > +++ b/tools/testing/vma/Makefile
> > > @@ -9,7 +9,7 @@ include ../shared/shared.mk
> > > OFILES = $(SHARED_OFILES) vma.o maple-shim.o
> > > TARGETS = vma
> > >
> > > -vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma.h
> > > +vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
> > >
> > > vma: $(OFILES)
> > > $(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)
> > > diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c
> > > index 7cfd6e31db10..5832ae5d797d 100644
> > > --- a/tools/testing/vma/vma.c
> > > +++ b/tools/testing/vma/vma.c
> > > @@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
> > > * Directly import the VMA implementation here. Our vma_internal.h wrapper
> > > * provides userland-equivalent functionality for everything vma.c uses.
> > > */
> > > +#include "../../../mm/vma_exec.c"
> > > #include "../../../mm/vma.c"
> > >
> > > const struct vm_operations_struct vma_dummy_vm_ops;
> > > diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
> > > index 572ab2cea763..0df19ca0000a 100644
> > > --- a/tools/testing/vma/vma_internal.h
> > > +++ b/tools/testing/vma/vma_internal.h
> > > @@ -421,6 +421,28 @@ struct vm_unmapped_area_info {
> > > unsigned long start_gap;
> > > };
> > >
> > > +struct pagetable_move_control {
> > > + struct vm_area_struct *old; /* Source VMA. */
> > > + struct vm_area_struct *new; /* Destination VMA. */
> > > + unsigned long old_addr; /* Address from which the move begins. */
> > > + unsigned long old_end; /* Exclusive address at which old range ends. */
> > > + unsigned long new_addr; /* Address to move page tables to. */
> > > + unsigned long len_in; /* Bytes to remap specified by user. */
> > > +
> > > + bool need_rmap_locks; /* Do rmap locks need to be taken? */
> > > + bool for_stack; /* Is this an early temp stack being moved? */
> > > +};
> > > +
> > > +#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_) \
> > > + struct pagetable_move_control name = { \
> > > + .old = old_, \
> > > + .new = new_, \
> > > + .old_addr = old_addr_, \
> > > + .old_end = (old_addr_) + (len_), \
> > > + .new_addr = new_addr_, \
> > > + .len_in = len_, \
> > > + }
> > > +
> > > static inline void vma_iter_invalidate(struct vma_iterator *vmi)
> > > {
> > > mas_pause(&vmi->mas);
> > > @@ -1240,4 +1262,22 @@ static inline int mapping_map_writable(struct address_space *mapping)
> > > return 0;
> > > }
> > >
> > > +static inline unsigned long move_page_tables(struct pagetable_move_control *pmc)
> > > +{
> > > + (void)pmc;
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static inline void free_pgd_range(struct mmu_gather *tlb,
> > > + unsigned long addr, unsigned long end,
> > > + unsigned long floor, unsigned long ceiling)
> > > +{
> > > + (void)tlb;
> > > + (void)addr;
> > > + (void)end;
> > > + (void)floor;
> > > + (void)ceiling;
> > > +}
> > > +
> > > #endif /* __MM_VMA_INTERNAL_H */
> > > --
> > > 2.49.0
> > >
On Mon, 28 Apr 2025 21:26:29 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:

> Andrew - I typo'd /* vma_exec.h */ below in the change to mm/vma.h - would it be
> possible to correct to vma_exec.c, or would a fixpatch make life easier?
>

I did this:

--- a/mm/vma.h~mm-establish-mm-vma_execc-for-shared-exec-mm-vma-functionality-fix
+++ a/mm/vma.h
@@ -548,7 +548,7 @@ int expand_downwards(struct vm_area_stru

 int __vm_munmap(unsigned long start, size_t len, bool unlock);

-/* vma_exec.h */
+/* vma_exec.c */
 #ifdef CONFIG_MMU
 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
 #endif
_