From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
Instead of zeroing the vma tree and then overwriting the area, let the
area be overwritten and then clean up the gathered vmas using
vms_complete_munmap_vmas().

If a driver is mapping over an existing vma, then clear the ptes before
the call_mmap() invocation. This is done using the vms_clear_ptes()
helper.

Temporarily keep track of the number of pages that will be removed, and
reduce the charged amount by the number of pages that are already
accounted for.

This also drops the validate_mm() call in the vma_expand() function.
Dropping the validation is necessary because it would fail: the mm
map_count is incorrect during a vma expansion, prior to the cleanup in
vms_complete_munmap_vmas().

Clean up the error handling of vms_gather_munmap_vmas() by calling the
verification within the function.
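
In outline, mmap_region() ends up ordered as follows. This is a
condensed sketch of the diff below, with error paths and unrelated code
elided, not the literal function:

	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
	...
	/* MAP_FIXED case: gather the old vmas, but leave the tree alone */
	if (vms_gather_munmap_vmas(&vms, &mas_detach))
		return -ENOMEM;
	...
	charged = len >> PAGE_SHIFT;
	charged -= nr_accounted;	/* e.g. replacing 10 accounted pages
					 * in a 16 page request charges 6 */
	...
	if (file) {
		/* clear the old ptes before a driver can install new ones */
		vms_clear_ptes(&vms, &mas_detach, true);
		error = call_mmap(file, vma);
	}
	...
expanded:
	/* only now are the gathered vmas actually unmapped */
	if (vms.nr_pages)
		vms_complete_munmap_vmas(&vms, &mas_detach);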
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
---
mm/internal.h | 1 +
mm/mmap.c | 80 +++++++++++++++++++++++++++------------------------
2 files changed, 44 insertions(+), 37 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 11e90c6e5a3e..dd4eede1be0f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
unsigned long stack_vm;
unsigned long data_vm;
bool unlock; /* Unlock after the munmap */
+ bool clear_ptes; /* If there are outstanding PTEs to be cleared */
};
void __meminit __init_single_page(struct page *page, unsigned long pfn,
diff --git a/mm/mmap.c b/mm/mmap.c
index 870c2d04ad6b..58cf42e22bfe 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+ unsigned long addr, unsigned long end,
+ unsigned long *nr_accounted)
{
VMA_ITERATOR(vmi, mm, addr);
struct vm_area_struct *vma;
unsigned long nr_pages = 0;
+ *nr_accounted = 0;
for_each_vma_range(vmi, vma, end) {
unsigned long vm_start = max(addr, vma->vm_start);
unsigned long vm_end = min(end, vma->vm_end);
nr_pages += PHYS_PFN(vm_end - vm_start);
+ if (vma->vm_flags & VM_ACCOUNT)
+ *nr_accounted += PHYS_PFN(vm_end - vm_start);
}
return nr_pages;
@@ -524,6 +528,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
vms->unmap_start = FIRST_USER_ADDRESS;
vms->unmap_end = USER_PGTABLES_CEILING;
+ vms->clear_ptes = false; /* No PTEs to clear yet */
}
/*
@@ -732,7 +737,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -2606,11 +2610,14 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach)
}
-static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
+static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
struct ma_state *mas_detach, bool mm_wr_locked)
{
struct mmu_gather tlb;
+ if (!vms->clear_ptes) /* Nothing to do */
+ return;
+
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
@@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
/* start and end may be different if there is no prev or next vma. */
free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
tlb_finish_mmu(&tlb);
+ vms->clear_ptes = false;
}
/*
@@ -2647,7 +2655,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
if (vms->unlock)
mmap_write_downgrade(mm);
- vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
+ vms_clear_ptes(vms, mas_detach, !vms->unlock);
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
/* Stat accounting */
@@ -2799,6 +2807,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
while (vma_iter_addr(vms->vmi) > vms->start)
vma_iter_prev_range(vms->vmi);
+ /* There are now PTEs that need to be cleared */
+ vms->clear_ptes = true;
+
return 0;
userfaultfd_error:
@@ -2807,6 +2818,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
abort_munmap_vmas(mas_detach);
start_split_failed:
map_count_exceeded:
+ validate_mm(vms->mm);
return error;
}
@@ -2851,8 +2863,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
clear_tree_failed:
abort_munmap_vmas(&mas_detach);
-gather_failed:
validate_mm(mm);
+gather_failed:
return error;
}
@@ -2940,24 +2952,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
pgoff_t vm_pgoff;
- int error;
+ int error = -ENOMEM;
VMA_ITERATOR(vmi, mm, addr);
+ unsigned long nr_pages, nr_accounted;
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, end);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
+ nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
+ /*
+ * Check against address space limit.
+ * MAP_FIXED may remove pages of mappings that intersect with the requested
+ * mapping. Account for the pages it would unmap.
+ */
+ if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
if (unlikely(!can_modify_mm(mm, addr, end)))
return -EPERM;
@@ -2974,18 +2981,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
/* Prepare to unmap any existing mapping in the area */
if (vms_gather_munmap_vmas(&vms, &mas_detach))
- goto gather_failed;
-
- /* Remove any existing mappings from the vma tree */
- if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
- goto clear_tree_failed;
+ return -ENOMEM;
- /* Unmap any existing mapping in the area */
- vms_complete_munmap_vmas(&vms, &mas_detach);
next = vms.next;
prev = vms.prev;
vma = NULL;
} else {
+ /* Minimal setup of vms */
next = vma_next(&vmi);
prev = vma_prev(&vmi);
if (prev)
@@ -2997,8 +2999,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
+ charged -= nr_accounted;
if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
+ goto abort_munmap;
+ vms.nr_accounted = 0;
vm_flags |= VM_ACCOUNT;
}
@@ -3047,10 +3051,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
+ if (!vma)
goto unacct_error;
- }
vma_iter_config(&vmi, addr, end);
vma_set_range(vma, addr, end, pgoff);
@@ -3059,6 +3061,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) {
vma->vm_file = get_file(file);
+ /* call_mmap() may map PTEs, so ensure there are no existing PTEs */
+ vms_clear_ptes(&vms, &mas_detach, true);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -3149,6 +3153,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
expanded:
perf_event_mmap(vma);
+ /* Unmap any existing mapping in the area */
+ if (vms.nr_pages)
+ vms_complete_munmap_vmas(&vms, &mas_detach);
+
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
@@ -3196,14 +3204,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unacct_error:
if (charged)
vm_unacct_memory(charged);
- validate_mm(mm);
- return error;
-clear_tree_failed:
- abort_munmap_vmas(&mas_detach);
-gather_failed:
+abort_munmap:
+ if (vms.nr_pages)
+ abort_munmap_vmas(&mas_detach);
validate_mm(mm);
- return -ENOMEM;
+ return error;
}
static int __vm_munmap(unsigned long start, size_t len, bool unlock)
--
2.43.0
Hello,
kernel test robot noticed "ltp.hugemmap06.fail" on:
commit: d793398401db9fb81084bd4fe2f782342201df18 ("[PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()")
url: https://github.com/intel-lab-lkp/linux/commits/Liam-R-Howlett/mm-mmap-Correctly-position-vma_iterator-in-__split_vma/20240711-075019
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20240710192250.4114783-15-Liam.Howlett@oracle.com/
patch subject: [PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()
in testcase: ltp
version: ltp-x86_64-14c1f76-1_20240706
with following parameters:
test: hugetlb/hugemmap06
compiler: gcc-13
test machine: 8 threads 1 sockets Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz (Ivy Bridge) with 16G memory
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202407162022.5a730c37-oliver.sang@intel.com
Running tests.......
<<<test_start>>>
tag=hugemmap06 stime=1721029963
cmdline="hugemmap06"
contacts=""
analysis=exit
<<<test_output>>>
tst_hugepage.c:84: TINFO: 255 hugepage(s) reserved
tst_test.c:1803: TINFO: LTP version: 20240524-71-g361f6ad13
tst_test.c:1647: TINFO: Timeout per run is 0h 00m 30s
hugemmap06.c:114: TPASS: No regression found
hugemmap06.c:114: TPASS: No regression found
hugemmap06.c:114: TPASS: No regression found
hugemmap06.c:114: TPASS: No regression found
hugemmap06.c:100: TFAIL: mmap failed: ENOMEM (12)
HINT: You _MAY_ be missing kernel fixes:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f522c3ac00a4
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9119a41e9091
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7b24d8616be3
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1406ec9ba6c6
Summary:
passed 4
failed 1
broken 0
skipped 0
warnings 0
incrementing stop
<<<execution_status>>>
initiation_status="ok"
duration=10 termination_type=exited termination_id=1 corefile=no
cutime=2 cstime=629
<<<test_end>>>
INFO: ltp-pan reported some tests FAIL
LTP Version: 20240524-71-g361f6ad13
###############################################################
Done executing testcases.
LTP Version: 20240524-71-g361f6ad13
###############################################################
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240716/202407162022.5a730c37-oliver.sang@intel.com
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
* kernel test robot <oliver.sang@intel.com> [240716 08:47]:
>
>
> Hello,
>
> kernel test robot noticed "ltp.hugemmap06.fail" on:
Hello Robot!
Thank you for finding this; it will certainly help me improve the next
revision of my series!
>
> commit: d793398401db9fb81084bd4fe2f782342201df18 ("[PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()")
> url: https://github.com/intel-lab-lkp/linux/commits/Liam-R-Howlett/mm-mmap-Correctly-position-vma_iterator-in-__split_vma/20240711-075019
> base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
> patch link: https://lore.kernel.org/all/20240710192250.4114783-15-Liam.Howlett@oracle.com/
> patch subject: [PATCH v4 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()
>
> in testcase: ltp
> version: ltp-x86_64-14c1f76-1_20240706
> with following parameters:
>
> test: hugetlb/hugemmap06
>
>
>
This is because the MAP_FIXED huge page mapping is now set up before
hugetlb_vm_op_close() is called on the existing vma, and that close is
what releases the reserved huge pages. I will address this in v5.
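
To make the failure concrete, the test reduces to a sequence like the
following. This is a minimal userspace sketch, assuming a hugetlb pool
with just enough pages for a single mapping; it illustrates the ordering
problem and is not the LTP source:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2UL << 20;	/* one 2MB huge page */
		int prot = PROT_READ | PROT_WRITE;
		int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
		void *p = mmap(NULL, len, prot, flags, -1, 0);

		if (p == MAP_FAILED)
			return 1;

		/*
		 * MAP_FIXED over the existing hugetlb vma: with the patched
		 * ordering, the new reservation is requested before
		 * hugetlb_vm_op_close() releases the old vma's reservation,
		 * so a nearly-exhausted pool makes this fail with ENOMEM.
		 */
		if (mmap(p, len, prot, flags | MAP_FIXED, -1, 0) == MAP_FAILED)
			perror("mmap MAP_FIXED");

		return 0;
	}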
On Wed, Jul 10, 2024 at 03:22:43PM GMT, Liam R. Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
>
> Instead of zeroing the vma tree and then overwriting the area, let the
> area be overwritten and then clean up the gathered vmas using
> vms_complete_munmap_vmas().
>
> If a driver is mapping over an existing vma, then clear the ptes before
> the call_mmap() invocation. This is done using the vms_clear_ptes()
> helper.
>
> Temporarily keep track of the number of pages that will be removed and
> reduce the charged amount.
>
> This also drops the validate_mm() call in the vma_expand() function.
> It is necessary to drop the validate as it would fail since the mm
> map_count would be incorrect during a vma expansion, prior to the
> cleanup from vms_complete_munmap_vmas().
>
> Clean up the error handling of vms_gather_munmap_vmas() by calling
> the verification within the function.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> ---
> mm/internal.h | 1 +
> mm/mmap.c | 80 +++++++++++++++++++++++++++------------------------
> 2 files changed, 44 insertions(+), 37 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 11e90c6e5a3e..dd4eede1be0f 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
> unsigned long stack_vm;
> unsigned long data_vm;
> bool unlock; /* Unlock after the munmap */
> + bool clear_ptes; /* If there are outstanding PTEs to be cleared */
> };
>
> void __meminit __init_single_page(struct page *page, unsigned long pfn,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 870c2d04ad6b..58cf42e22bfe 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> }
>
> static unsigned long count_vma_pages_range(struct mm_struct *mm,
> - unsigned long addr, unsigned long end)
> + unsigned long addr, unsigned long end,
> + unsigned long *nr_accounted)
> {
> VMA_ITERATOR(vmi, mm, addr);
> struct vm_area_struct *vma;
> unsigned long nr_pages = 0;
>
> + *nr_accounted = 0;
> for_each_vma_range(vmi, vma, end) {
> unsigned long vm_start = max(addr, vma->vm_start);
> unsigned long vm_end = min(end, vma->vm_end);
>
> nr_pages += PHYS_PFN(vm_end - vm_start);
> + if (vma->vm_flags & VM_ACCOUNT)
> + *nr_accounted += PHYS_PFN(vm_end - vm_start);
> }
>
> return nr_pages;
> @@ -524,6 +528,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
> vms->unmap_start = FIRST_USER_ADDRESS;
> vms->unmap_end = USER_PGTABLES_CEILING;
> + vms->clear_ptes = false; /* No PTEs to clear yet */
> }
>
> /*
> @@ -732,7 +737,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> vma_iter_store(vmi, vma);
>
> vma_complete(&vp, vmi, vma->vm_mm);
> - validate_mm(vma->vm_mm);
> return 0;
>
> nomem:
> @@ -2606,11 +2610,14 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach)
> }
>
>
> -static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> +static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> struct ma_state *mas_detach, bool mm_wr_locked)
> {
> struct mmu_gather tlb;
>
> + if (!vms->clear_ptes) /* Nothing to do */
> + return;
> +
> /*
> * We can free page tables without write-locking mmap_lock because VMAs
> * were isolated before we downgraded mmap_lock.
> @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> /* start and end may be different if there is no prev or next vma. */
> free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
> tlb_finish_mmu(&tlb);
> + vms->clear_ptes = false;
> }
>
> /*
> @@ -2647,7 +2655,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> if (vms->unlock)
> mmap_write_downgrade(mm);
>
> - vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
> + vms_clear_ptes(vms, mas_detach, !vms->unlock);
> /* Update high watermark before we lower total_vm */
> update_hiwater_vm(mm);
> /* Stat accounting */
> @@ -2799,6 +2807,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> while (vma_iter_addr(vms->vmi) > vms->start)
> vma_iter_prev_range(vms->vmi);
>
> + /* There are now PTEs that need to be cleared */
> + vms->clear_ptes = true;
> +
> return 0;
>
> userfaultfd_error:
> @@ -2807,6 +2818,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> abort_munmap_vmas(mas_detach);
> start_split_failed:
> map_count_exceeded:
> + validate_mm(vms->mm);
I'm guessing here we know it's safe to validate?
> return error;
> }
>
> @@ -2851,8 +2863,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>
> clear_tree_failed:
> abort_munmap_vmas(&mas_detach);
> -gather_failed:
> validate_mm(mm);
Additionally, I imagine that a failure while gathering leaves the tree in a
state that cannot be validated?
> +gather_failed:
> return error;
> }
>
> @@ -2940,24 +2952,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> unsigned long merge_start = addr, merge_end = end;
> bool writable_file_mapping = false;
> pgoff_t vm_pgoff;
> - int error;
> + int error = -ENOMEM;
> VMA_ITERATOR(vmi, mm, addr);
> + unsigned long nr_pages, nr_accounted;
>
> - /* Check against address space limit. */
> - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> - unsigned long nr_pages;
> -
> - /*
> - * MAP_FIXED may remove pages of mappings that intersects with
> - * requested mapping. Account for the pages it would unmap.
> - */
> - nr_pages = count_vma_pages_range(mm, addr, end);
> -
> - if (!may_expand_vm(mm, vm_flags,
> - (len >> PAGE_SHIFT) - nr_pages))
> - return -ENOMEM;
> - }
> + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
>
> + /*
> + * Check against address space limit.
> + * MAP_FIXED may remove pages of mappings that intersect with the requested
> + * mapping. Account for the pages it would unmap.
> + */
> + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> + return -ENOMEM;
>
> if (unlikely(!can_modify_mm(mm, addr, end)))
> return -EPERM;
> @@ -2974,18 +2981,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
> /* Prepare to unmap any existing mapping in the area */
> if (vms_gather_munmap_vmas(&vms, &mas_detach))
> - goto gather_failed;
> -
> - /* Remove any existing mappings from the vma tree */
> - if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> - goto clear_tree_failed;
> + return -ENOMEM;
>
> - /* Unmap any existing mapping in the area */
> - vms_complete_munmap_vmas(&vms, &mas_detach);
> next = vms.next;
> prev = vms.prev;
> vma = NULL;
> } else {
> + /* Minimal setup of vms */
Nit, but is this valid now that we use the init function unconditionally?
> next = vma_next(&vmi);
> prev = vma_prev(&vmi);
> if (prev)
> @@ -2997,8 +2999,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> */
> if (accountable_mapping(file, vm_flags)) {
> charged = len >> PAGE_SHIFT;
> + charged -= nr_accounted;
> if (security_vm_enough_memory_mm(mm, charged))
> - return -ENOMEM;
> + goto abort_munmap;
> + vms.nr_accounted = 0;
> vm_flags |= VM_ACCOUNT;
> }
>
> @@ -3047,10 +3051,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * not unmapped, but the maps are removed from the list.
> */
> vma = vm_area_alloc(mm);
> - if (!vma) {
> - error = -ENOMEM;
> + if (!vma)
> goto unacct_error;
> - }
>
> vma_iter_config(&vmi, addr, end);
> vma_set_range(vma, addr, end, pgoff);
> @@ -3059,6 +3061,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> if (file) {
> vma->vm_file = get_file(file);
> + /* call_mmap() may map PTEs, so ensure there are no existing PTEs */
> + vms_clear_ptes(&vms, &mas_detach, true);
> error = call_mmap(file, vma);
> if (error)
> goto unmap_and_free_vma;
> @@ -3149,6 +3153,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> expanded:
> perf_event_mmap(vma);
>
> + /* Unmap any existing mapping in the area */
> + if (vms.nr_pages)
> + vms_complete_munmap_vmas(&vms, &mas_detach);
> +
> vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> if (vm_flags & VM_LOCKED) {
> if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> @@ -3196,14 +3204,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> unacct_error:
> if (charged)
> vm_unacct_memory(charged);
> - validate_mm(mm);
> - return error;
>
> -clear_tree_failed:
> - abort_munmap_vmas(&mas_detach);
> -gather_failed:
> +abort_munmap:
> + if (vms.nr_pages)
> + abort_munmap_vmas(&mas_detach);
> validate_mm(mm);
> - return -ENOMEM;
> + return error;
> }
>
> static int __vm_munmap(unsigned long start, size_t len, bool unlock)
> --
> 2.43.0
>
Other than nits/queries, LGTM:
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [240711 11:25]:
> On Wed, Jul 10, 2024 at 03:22:43PM GMT, Liam R. Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
> >
> > Instead of zeroing the vma tree and then overwriting the area, let the
> > area be overwritten and then clean up the gathered vmas using
> > vms_complete_munmap_vmas().
> >
> > If a driver is mapping over an existing vma, then clear the ptes before
> > the call_mmap() invocation. This is done using the vms_clear_ptes()
> > helper.
> >
> > Temporarily keep track of the number of pages that will be removed and
> > reduce the charged amount.
> >
> > This also drops the validate_mm() call in the vma_expand() function.
> > It is necessary to drop the validate as it would fail since the mm
> > map_count would be incorrect during a vma expansion, prior to the
> > cleanup from vms_complete_munmap_vmas().
> >
> > Clean up the error handling of vms_gather_munmap_vmas() by calling
> > the verification within the function.
> >
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
> > ---
> > mm/internal.h | 1 +
> > mm/mmap.c | 80 +++++++++++++++++++++++++++------------------------
> > 2 files changed, 44 insertions(+), 37 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 11e90c6e5a3e..dd4eede1be0f 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -1503,6 +1503,7 @@ struct vma_munmap_struct {
> > unsigned long stack_vm;
> > unsigned long data_vm;
> > bool unlock; /* Unlock after the munmap */
> > + bool clear_ptes; /* If there are outstanding PTEs to be cleared */
> > };
> >
> > void __meminit __init_single_page(struct page *page, unsigned long pfn,
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 870c2d04ad6b..58cf42e22bfe 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -401,17 +401,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
> > }
> >
> > static unsigned long count_vma_pages_range(struct mm_struct *mm,
> > - unsigned long addr, unsigned long end)
> > + unsigned long addr, unsigned long end,
> > + unsigned long *nr_accounted)
> > {
> > VMA_ITERATOR(vmi, mm, addr);
> > struct vm_area_struct *vma;
> > unsigned long nr_pages = 0;
> >
> > + *nr_accounted = 0;
> > for_each_vma_range(vmi, vma, end) {
> > unsigned long vm_start = max(addr, vma->vm_start);
> > unsigned long vm_end = min(end, vma->vm_end);
> >
> > nr_pages += PHYS_PFN(vm_end - vm_start);
> > + if (vma->vm_flags & VM_ACCOUNT)
> > + *nr_accounted += PHYS_PFN(vm_end - vm_start);
> > }
> >
> > return nr_pages;
> > @@ -524,6 +528,7 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
> > vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
> > vms->unmap_start = FIRST_USER_ADDRESS;
> > vms->unmap_end = USER_PGTABLES_CEILING;
> > + vms->clear_ptes = false; /* No PTEs to clear yet */
> > }
> >
> > /*
> > @@ -732,7 +737,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
> > vma_iter_store(vmi, vma);
> >
> > vma_complete(&vp, vmi, vma->vm_mm);
> > - validate_mm(vma->vm_mm);
> > return 0;
> >
> > nomem:
> > @@ -2606,11 +2610,14 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach)
> > }
> >
> >
> > -static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> > +static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> > struct ma_state *mas_detach, bool mm_wr_locked)
> > {
> > struct mmu_gather tlb;
> >
> > + if (!vms->clear_ptes) /* Nothing to do */
> > + return;
> > +
> > /*
> > * We can free page tables without write-locking mmap_lock because VMAs
> > * were isolated before we downgraded mmap_lock.
> > @@ -2624,6 +2631,7 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> > /* start and end may be different if there is no prev or next vma. */
> > free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
> > tlb_finish_mmu(&tlb);
> > + vms->clear_ptes = false;
> > }
> >
> > /*
> > @@ -2647,7 +2655,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
> > if (vms->unlock)
> > mmap_write_downgrade(mm);
> >
> > - vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
> > + vms_clear_ptes(vms, mas_detach, !vms->unlock);
> > /* Update high watermark before we lower total_vm */
> > update_hiwater_vm(mm);
> > /* Stat accounting */
> > @@ -2799,6 +2807,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> > while (vma_iter_addr(vms->vmi) > vms->start)
> > vma_iter_prev_range(vms->vmi);
> >
> > + /* There are now PTEs that need to be cleared */
> > + vms->clear_ptes = true;
> > +
> > return 0;
> >
> > userfaultfd_error:
> > @@ -2807,6 +2818,7 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> > abort_munmap_vmas(mas_detach);
> > start_split_failed:
> > map_count_exceeded:
> > + validate_mm(vms->mm);
>
> I'm guessing here we know it's safe to validate?
verification in the gather state is always safe - we haven't changed the
tree or a vma yet.
>
> > return error;
> > }
> >
> > @@ -2851,8 +2863,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
> >
> > clear_tree_failed:
> > abort_munmap_vmas(&mas_detach);
> > -gather_failed:
> > validate_mm(mm);
>
> Additionally, I imagine that a failure while gathering leaves the tree in a
> state that cannot be validated?
It is safe, but if the validation is here then it doesn't need to be above
as well.
>
> > +gather_failed:
> > return error;
> > }
> >
> > @@ -2940,24 +2952,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > unsigned long merge_start = addr, merge_end = end;
> > bool writable_file_mapping = false;
> > pgoff_t vm_pgoff;
> > - int error;
> > + int error = -ENOMEM;
> > VMA_ITERATOR(vmi, mm, addr);
> > + unsigned long nr_pages, nr_accounted;
> >
> > - /* Check against address space limit. */
> > - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> > - unsigned long nr_pages;
> > -
> > - /*
> > - * MAP_FIXED may remove pages of mappings that intersects with
> > - * requested mapping. Account for the pages it would unmap.
> > - */
> > - nr_pages = count_vma_pages_range(mm, addr, end);
> > -
> > - if (!may_expand_vm(mm, vm_flags,
> > - (len >> PAGE_SHIFT) - nr_pages))
> > - return -ENOMEM;
> > - }
> > + nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
> >
> > + /*
> > + * Check against address space limit.
> > + * MAP_FIXED may remove pages of mappings that intersect with the requested
> > + * mapping. Account for the pages it would unmap.
> > + */
> > + if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> > + return -ENOMEM;
> >
> > if (unlikely(!can_modify_mm(mm, addr, end)))
> > return -EPERM;
> > @@ -2974,18 +2981,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
> > /* Prepare to unmap any existing mapping in the area */
> > if (vms_gather_munmap_vmas(&vms, &mas_detach))
> > - goto gather_failed;
> > -
> > - /* Remove any existing mappings from the vma tree */
> > - if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> > - goto clear_tree_failed;
> > + return -ENOMEM;
> >
> > - /* Unmap any existing mapping in the area */
> > - vms_complete_munmap_vmas(&vms, &mas_detach);
> > next = vms.next;
> > prev = vms.prev;
> > vma = NULL;
> > } else {
> > + /* Minimal setup of vms */
>
> Nit, but is this valid now that we use the init function unconditionally?
Yes, that needs to be dropped, thanks.
>
> > next = vma_next(&vmi);
> > prev = vma_prev(&vmi);
> > if (prev)
> > @@ -2997,8 +2999,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > */
> > if (accountable_mapping(file, vm_flags)) {
> > charged = len >> PAGE_SHIFT;
> > + charged -= nr_accounted;
> > if (security_vm_enough_memory_mm(mm, charged))
> > - return -ENOMEM;
> > + goto abort_munmap;
> > + vms.nr_accounted = 0;
> > vm_flags |= VM_ACCOUNT;
> > }
> >
> > @@ -3047,10 +3051,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > * not unmapped, but the maps are removed from the list.
> > */
> > vma = vm_area_alloc(mm);
> > - if (!vma) {
> > - error = -ENOMEM;
> > + if (!vma)
> > goto unacct_error;
> > - }
> >
> > vma_iter_config(&vmi, addr, end);
> > vma_set_range(vma, addr, end, pgoff);
> > @@ -3059,6 +3061,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> >
> > if (file) {
> > vma->vm_file = get_file(file);
> > + /* call_mmap() may map PTEs, so ensure there are no existing PTEs */
> > + vms_clear_ptes(&vms, &mas_detach, true);
> > error = call_mmap(file, vma);
> > if (error)
> > goto unmap_and_free_vma;
> > @@ -3149,6 +3153,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > expanded:
> > perf_event_mmap(vma);
> >
> > + /* Unmap any existing mapping in the area */
> > + if (vms.nr_pages)
> > + vms_complete_munmap_vmas(&vms, &mas_detach);
> > +
> > vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
> > if (vm_flags & VM_LOCKED) {
> > if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> > @@ -3196,14 +3204,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> > unacct_error:
> > if (charged)
> > vm_unacct_memory(charged);
> > - validate_mm(mm);
> > - return error;
> >
> > -clear_tree_failed:
> > - abort_munmap_vmas(&mas_detach);
> > -gather_failed:
> > +abort_munmap:
> > + if (vms.nr_pages)
> > + abort_munmap_vmas(&mas_detach);
> > validate_mm(mm);
> > - return -ENOMEM;
> > + return error;
> > }
> >
> > static int __vm_munmap(unsigned long start, size_t len, bool unlock)
> > --
> > 2.43.0
> >
>
> Other than nits/queries, LGTM:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>