Doing so means we can get rid of all the weird struct vm_area_struct **prev
stuff, everything becomes consistent, and in future, if we want to make
changes to behaviour, there is a single place where all relevant state is
stored.
This also allows us to update try_vma_read_lock() to be a little more
succinct and set up state for us, as well as cleaning up
madvise_update_vma().
We also update the debug assertion prior to madvise_update_vma() to assert
that this is a write operation, as correctly pointed out by Barry in the
relevant thread.
We can't reasonably update the madvise functions that live outside of
mm/madvise.c so we leave those as-is.
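As a rough sketch of the resulting shape (illustrative only; field order and
unrelated fields elided, not verbatim from the diff below):

	struct madvise_behavior {
		...
		/* Range currently being operated on; updated per VMA. */
		struct madvise_behavior_range range;
		/* The VMA and VMA preceding it (if applicable) currently targeted. */
		struct vm_area_struct *prev;
		struct vm_area_struct *vma;
	};

	/* Per-behaviour handlers now take only this state, e.g.: */
	static long madvise_cold(struct madvise_behavior *madv_behavior);
	static long madvise_guard_install(struct madvise_behavior *madv_behavior);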
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
mm/madvise.c | 283 ++++++++++++++++++++++++++-------------------------
1 file changed, 146 insertions(+), 137 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 6faa38b92111..86fe04aa7c88 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -74,6 +74,8 @@ struct madvise_behavior {
* traversing multiple VMAs, this is updated for each.
*/
struct madvise_behavior_range range;
+ /* The VMA and VMA preceding it (if applicable) currently targeted. */
+ struct vm_area_struct *prev, *vma;
};
#ifdef CONFIG_ANON_VMA_NAME
@@ -194,26 +196,27 @@ static bool is_anon_vma_name(int behavior)
* Caller should ensure anon_name stability by raising its refcount even when
* anon_name belongs to a valid vma because this function might free that vma.
*/
-static int madvise_update_vma(struct vm_area_struct *vma,
- struct vm_area_struct **prev, unsigned long start,
- unsigned long end, vm_flags_t new_flags,
- struct anon_vma_name *anon_name)
+static int madvise_update_vma(vm_flags_t new_flags,
+ struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
int error;
- VMA_ITERATOR(vmi, mm, start);
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+ struct anon_vma_name *anon_name = madv_behavior->anon_name;
+ VMA_ITERATOR(vmi, madv_behavior->mm, range->start);
if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
- *prev = vma;
+ madv_behavior->prev = vma;
return 0;
}
- vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
- anon_name);
+ vma = vma_modify_flags_name(&vmi, madv_behavior->prev, vma,
+ range->start, range->end, new_flags, anon_name);
if (IS_ERR(vma))
return PTR_ERR(vma);
- *prev = vma;
+ madv_behavior->vma = vma;
+ madv_behavior->prev = vma;
/* vm_flags is protected by the mmap_lock held in write mode. */
vma_start_write(vma);
@@ -318,15 +321,16 @@ static void shmem_swapin_range(struct vm_area_struct *vma,
/*
* Schedule all required I/O operations. Do not wait for completion.
*/
-static long madvise_willneed(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_willneed(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct mm_struct *mm = madv_behavior->mm;
struct file *file = vma->vm_file;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
loff_t offset;
- *prev = vma;
+ madv_behavior->prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
walk_page_range_vma(vma, start, end, &swapin_walk_ops, vma);
@@ -355,7 +359,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
* vma's reference to the file) can go away as soon as we drop
* mmap_lock.
*/
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ madv_behavior->prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
@@ -620,16 +624,19 @@ static const struct mm_walk_ops cold_walk_ops = {
};
static void madvise_cold_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct madvise_behavior *madv_behavior)
+
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct madvise_walk_private walk_private = {
.pageout = false,
.tlb = tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range_vma(vma, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
@@ -638,28 +645,26 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
}
-static long madvise_cold(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_cold(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
struct mmu_gather tlb;
- *prev = vma;
+ madv_behavior->prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_cold_page_range(&tlb, madv_behavior);
tlb_finish_mmu(&tlb);
return 0;
}
static void madvise_pageout_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+ struct vm_area_struct *vma,
+ struct madvise_behavior_range *range)
{
struct madvise_walk_private walk_private = {
.pageout = true,
@@ -667,18 +672,17 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb,
};
tlb_start_vma(tlb, vma);
- walk_page_range_vma(vma, addr, end, &cold_walk_ops, &walk_private);
+ walk_page_range_vma(vma, range->start, range->end, &cold_walk_ops,
+ &walk_private);
tlb_end_vma(tlb, vma);
}
-static long madvise_pageout(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start_addr, unsigned long end_addr)
+static long madvise_pageout(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
struct mmu_gather tlb;
+ struct vm_area_struct *vma = madv_behavior->vma;
- *prev = vma;
+ madv_behavior->prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -693,8 +697,8 @@ static long madvise_pageout(struct vm_area_struct *vma,
return 0;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_gather_mmu(&tlb, madv_behavior->mm);
+ madvise_pageout_page_range(&tlb, vma, &madv_behavior->range);
tlb_finish_mmu(&tlb);
return 0;
@@ -857,11 +861,12 @@ static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
}
}
-static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
- struct vm_area_struct *vma,
- unsigned long start_addr, unsigned long end_addr)
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start_addr = madv_behavior->range.start;
+ unsigned long end_addr = madv_behavior->range.end;
struct mmu_notifier_range range;
struct mmu_gather *tlb = madv_behavior->tlb;
struct mm_walk_ops walk_ops = {
@@ -913,25 +918,28 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior)
+
{
+ struct madvise_behavior_range *range = &madv_behavior->range;
struct zap_details details = {
.reclaim_pt = true,
.even_cows = true,
};
zap_page_range_single_batched(
- madv_behavior->tlb, vma, start, end - start, &details);
+ madv_behavior->tlb, madv_behavior->vma, range->start,
+ range->end - range->start, &details);
return 0;
}
-static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
- unsigned long start,
- unsigned long *end,
- int behavior)
+static
+bool madvise_dontneed_free_valid_vma(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ int behavior = madv_behavior->behavior;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
if (!is_vm_hugetlb_page(vma)) {
unsigned int forbidden = VM_PFNMAP;
@@ -943,7 +951,7 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
return false;
- if (start & ~huge_page_mask(hstate_vma(vma)))
+ if (range->start & ~huge_page_mask(hstate_vma(vma)))
return false;
/*
@@ -952,41 +960,40 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
* Avoid unexpected data loss by rounding down the number of
* huge pages freed.
*/
- *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
+ range->end = ALIGN_DOWN(range->end, huge_page_size(hstate_vma(vma)));
return true;
}
-static long madvise_dontneed_free(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- struct madvise_behavior *madv_behavior)
+static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
{
+ struct mm_struct *mm = madv_behavior->mm;
+ struct madvise_behavior_range *range = &madv_behavior->range;
int behavior = madv_behavior->behavior;
- struct mm_struct *mm = vma->vm_mm;
- *prev = vma;
- if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
+ madv_behavior->prev = madv_behavior->vma;
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (start == end)
+ if (range->start == range->end)
return 0;
- if (!userfaultfd_remove(vma, start, end)) {
- *prev = NULL; /* mmap_lock has been dropped, prev is stale */
+ if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
+ struct vm_area_struct *vma;
+
+ madv_behavior->prev = NULL; /* mmap_lock has been dropped, prev is stale */
mmap_read_lock(mm);
- vma = vma_lookup(mm, start);
+ madv_behavior->vma = vma = vma_lookup(mm, range->start);
if (!vma)
return -ENOMEM;
/*
* Potential end adjustment for hugetlb vma is OK as
* the check below keeps end within vma.
*/
- if (!madvise_dontneed_free_valid_vma(vma, start, &end,
- behavior))
+ if (!madvise_dontneed_free_valid_vma(madv_behavior))
return -EINVAL;
- if (end > vma->vm_end) {
+ if (range->end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
* vma was split while the mmap_lock was
@@ -999,7 +1006,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* end-vma->vm_end range, but the manager can
* handle a repetition fine.
*/
- end = vma->vm_end;
+ range->end = vma->vm_end;
}
/*
* If the memory region between start and end was
@@ -1008,16 +1015,15 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
* the adjustment for hugetlb vma above may have rounded
* end down to the start address.
*/
- if (start == end)
+ if (range->start == range->end)
return 0;
- VM_WARN_ON(start > end);
+ VM_WARN_ON(range->start > range->end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(
- madv_behavior, vma, start, end);
+ return madvise_dontneed_single_vma(madv_behavior);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(madv_behavior, vma, start, end);
+ return madvise_free_single_vma(madv_behavior);
else
return -EINVAL;
}
@@ -1065,16 +1071,17 @@ static long madvise_populate(struct madvise_behavior *madv_behavior)
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
*/
-static long madvise_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_remove(struct madvise_behavior *madv_behavior)
{
loff_t offset;
int error;
struct file *f;
- struct mm_struct *mm = vma->vm_mm;
+ struct mm_struct *mm = madv_behavior->mm;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ unsigned long start = madv_behavior->range.start;
+ unsigned long end = madv_behavior->range.end;
- *prev = NULL; /* tell sys_madvise we drop mmap_lock */
+ madv_behavior->prev = NULL; /* tell sys_madvise we drop mmap_lock */
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -1186,14 +1193,14 @@ static const struct mm_walk_ops guard_install_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_install(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_install(struct madvise_behavior *madv_behavior)
{
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
long err;
int i;
- *prev = vma;
+ madv_behavior->prev = vma;
if (!is_valid_guard_vma(vma, /* allow_locked = */false))
return -EINVAL;
@@ -1224,13 +1231,14 @@ static long madvise_guard_install(struct vm_area_struct *vma,
unsigned long nr_pages = 0;
/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
- err = walk_page_range_mm(vma->vm_mm, start, end,
+ err = walk_page_range_mm(vma->vm_mm, range->start, range->end,
&guard_install_walk_ops, &nr_pages);
if (err < 0)
return err;
if (err == 0) {
- unsigned long nr_expected_pages = PHYS_PFN(end - start);
+ unsigned long nr_expected_pages =
+ PHYS_PFN(range->end - range->start);
VM_WARN_ON(nr_pages != nr_expected_pages);
return 0;
@@ -1240,7 +1248,8 @@ static long madvise_guard_install(struct vm_area_struct *vma,
* OK some of the range have non-guard pages mapped, zap
* them. This leaves existing guard pages in place.
*/
- zap_page_range_single(vma, start, end - start, NULL);
+ zap_page_range_single(vma, range->start,
+ range->end - range->start, NULL);
}
/*
@@ -1296,11 +1305,12 @@ static const struct mm_walk_ops guard_remove_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static long madvise_guard_remove(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
{
- *prev = vma;
+ struct vm_area_struct *vma = madv_behavior->vma;
+ struct madvise_behavior_range *range = &madv_behavior->range;
+
+ madv_behavior->prev = vma;
/*
* We're ok with removing guards in mlock()'d ranges, as this is a
* non-destructive action.
@@ -1308,7 +1318,7 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
if (!is_valid_guard_vma(vma, /* allow_locked = */true))
return -EINVAL;
- return walk_page_range_vma(vma, start, end,
+ return walk_page_range_vma(vma, range->start, range->end,
&guard_remove_walk_ops, NULL);
}
@@ -1317,40 +1327,37 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
* will handle splitting a vm area into separate areas, each area with its own
* behavior.
*/
-static int madvise_vma_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- struct madvise_behavior *madv_behavior)
+static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
{
int behavior = madv_behavior->behavior;
- struct anon_vma_name *anon_name = madv_behavior->anon_name;
+ struct vm_area_struct *vma = madv_behavior->vma;
vm_flags_t new_flags = vma->vm_flags;
- unsigned long start = madv_behavior->range.start;
- unsigned long end = madv_behavior->range.end;
+ struct madvise_behavior_range *range = &madv_behavior->range;
int error;
- if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
return -EPERM;
switch (behavior) {
case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
+ return madvise_remove(madv_behavior);
case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
+ return madvise_willneed(madv_behavior);
case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
+ return madvise_cold(madv_behavior);
case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
+ return madvise_pageout(madv_behavior);
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end,
- madv_behavior);
+ return madvise_dontneed_free(madv_behavior);
case MADV_COLLAPSE:
- return madvise_collapse(vma, prev, start, end);
+ return madvise_collapse(vma, &madv_behavior->prev,
+ range->start, range->end);
case MADV_GUARD_INSTALL:
- return madvise_guard_install(vma, prev, start, end);
+ return madvise_guard_install(madv_behavior);
case MADV_GUARD_REMOVE:
- return madvise_guard_remove(vma, prev, start, end);
+ return madvise_guard_remove(madv_behavior);
/* The below behaviours update VMAs via madvise_update_vma(). */
@@ -1367,18 +1374,18 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
- if (vma->vm_flags & VM_IO)
+ if (new_flags & VM_IO)
return -EINVAL;
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ if (vma->vm_file || new_flags & VM_SHARED)
return -EINVAL;
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
- if (vma->vm_flags & VM_DROPPABLE)
+ if (new_flags & VM_DROPPABLE)
return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
@@ -1386,14 +1393,15 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
- (vma->vm_flags & VM_DROPPABLE))
+ if ((!is_vm_hugetlb_page(vma) && (new_flags & VM_SPECIAL)) ||
+ (new_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ error = ksm_madvise(vma, range->start, range->end,
+ behavior, &new_flags);
if (error)
goto out;
break;
@@ -1411,18 +1419,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}
- /* We cannot provide prev in this lock mode. */
- VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);
+ /* This is a write operation. */
+ VM_WARN_ON_ONCE(madv_behavior->lock_mode != MADVISE_MMAP_WRITE_LOCK);
if (!is_anon_vma_name(behavior)) {
- anon_name = anon_vma_name(vma);
- anon_vma_name_get(anon_name);
+ madv_behavior->anon_name = anon_vma_name(vma);
+ anon_vma_name_get(madv_behavior->anon_name);
}
- error = madvise_update_vma(vma, prev, start, end, new_flags,
- anon_name);
+ error = madvise_update_vma(new_flags, madv_behavior);
if (!is_anon_vma_name(behavior))
- anon_vma_name_put(anon_name);
-
+ anon_vma_name_put(madv_behavior->anon_name);
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1572,13 +1578,13 @@ static bool process_madvise_remote_valid(int behavior)
* span either partially or fully.
*
* This function always returns with an appropriate lock held. If a VMA read
- * lock could be acquired, we return the locked VMA.
+ * lock could be acquired, we return true and set madv_behavior state
+ * accordingly.
*
- * If a VMA read lock could not be acquired, we return NULL and expect caller to
+ * If a VMA read lock could not be acquired, we return false and expect caller to
* fallback to mmap lock behaviour.
*/
-static
-struct vm_area_struct *try_vma_read_lock(struct madvise_behavior *madv_behavior)
+static bool try_vma_read_lock(struct madvise_behavior *madv_behavior)
{
struct mm_struct *mm = madv_behavior->mm;
struct vm_area_struct *vma;
@@ -1595,12 +1601,14 @@ struct vm_area_struct *try_vma_read_lock(struct madvise_behavior *madv_behavior)
vma_end_read(vma);
goto take_mmap_read_lock;
}
- return vma;
+ madv_behavior->prev = vma; /* Not currently required. */
+ madv_behavior->vma = vma;
+ return true;
take_mmap_read_lock:
mmap_read_lock(mm);
madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
- return NULL;
+ return false;
}
/*
@@ -1617,23 +1625,19 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
struct mm_struct *mm = madv_behavior->mm;
struct madvise_behavior_range *range = &madv_behavior->range;
unsigned long end = range->end;
- struct vm_area_struct *vma;
- struct vm_area_struct *prev;
int unmapped_error = 0;
int error;
+ struct vm_area_struct *vma;
/*
* If VMA read lock is supported, apply madvise to a single VMA
* tentatively, avoiding walking VMAs.
*/
- if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
- vma = try_vma_read_lock(madv_behavior);
- if (vma) {
- prev = vma;
- error = madvise_vma_behavior(vma, &prev, madv_behavior);
- vma_end_read(vma);
- return error;
- }
+ if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
+ try_vma_read_lock(madv_behavior)) {
+ error = madvise_vma_behavior(madv_behavior);
+ vma_end_read(madv_behavior->vma);
+ return error;
}
/*
@@ -1641,11 +1645,13 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
- vma = find_vma_prev(mm, range->start, &prev);
+ vma = find_vma_prev(mm, range->start, &madv_behavior->prev);
if (vma && range->start > vma->vm_start)
- prev = vma;
+ madv_behavior->prev = vma;
for (;;) {
+ struct vm_area_struct *prev;
+
/* Still start < end. */
if (!vma)
return -ENOMEM;
@@ -1662,13 +1668,16 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
range->end = min(vma->vm_end, end);
/* Here vma->vm_start <= range->start < range->end <= (end|vma->vm_end). */
- error = madvise_vma_behavior(vma, &prev, madv_behavior);
+ madv_behavior->vma = vma;
+ error = madvise_vma_behavior(madv_behavior);
if (error)
return error;
+ prev = madv_behavior->prev;
+
range->start = range->end;
if (prev && range->start < prev->vm_end)
range->start = prev->vm_end;
- if (range->start >= range->end)
+ if (range->start >= end)
break;
if (prev)
vma = find_vma(mm, prev->vm_end);
--
2.49.0
On 6/19/25 22:26, Lorenzo Stoakes wrote:
> Doing so means we can get rid of all the weird struct vm_area_struct **prev
> stuff, everything becomes consistent and in future if we want to make
> change to behaviour there's a single place where all relevant state is
> stored.
>
> This also allows us to update try_vma_read_lock() to be a little more
> succinct and set up state for us, as well as cleaning up
> madvise_update_vma().
>
> We also update the debug assertion prior to madvise_update_vma() to assert
> that this is a write operation as correctly pointed out by Barry in the
> relevant thread.
>
> We can't reasonably update the madvise functions that live outside of
> mm/madvise.c so we leave those as-is.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

The prev manipulation is indeed confusing, looking forward to the next patch...

Nits:

> ---
> mm/madvise.c | 283 ++++++++++++++++++++++++++-------------------------
> 1 file changed, 146 insertions(+), 137 deletions(-)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 6faa38b92111..86fe04aa7c88 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -74,6 +74,8 @@ struct madvise_behavior {
> * traversing multiple VMAs, this is updated for each.
> */
> struct madvise_behavior_range range;
> + /* The VMA and VMA preceding it (if applicable) currently targeted. */
> + struct vm_area_struct *prev, *vma;

Would also do separate lines here.

> -static long madvise_dontneed_free(struct vm_area_struct *vma,
> - struct vm_area_struct **prev,
> - unsigned long start, unsigned long end,
> - struct madvise_behavior *madv_behavior)
> +static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
> {
> + struct mm_struct *mm = madv_behavior->mm;
> + struct madvise_behavior_range *range = &madv_behavior->range;
> int behavior = madv_behavior->behavior;
> - struct mm_struct *mm = vma->vm_mm;
>
> - *prev = vma;
> - if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
> + madv_behavior->prev = madv_behavior->vma;
> + if (!madvise_dontneed_free_valid_vma(madv_behavior))
> return -EINVAL;
>
> - if (start == end)
> + if (range->start == range->end)
> return 0;
>
> - if (!userfaultfd_remove(vma, start, end)) {
> - *prev = NULL; /* mmap_lock has been dropped, prev is stale */
> + if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
> + struct vm_area_struct *vma;
> +
> + madv_behavior->prev = NULL; /* mmap_lock has been dropped, prev is stale */
>
> mmap_read_lock(mm);
> - vma = vma_lookup(mm, start);
> + madv_behavior->vma = vma = vma_lookup(mm, range->start);

This replaces vma in madv_behavior...

> @@ -1617,23 +1625,19 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> struct mm_struct *mm = madv_behavior->mm;
> struct madvise_behavior_range *range = &madv_behavior->range;
> unsigned long end = range->end;
> - struct vm_area_struct *vma;
> - struct vm_area_struct *prev;
> int unmapped_error = 0;
> int error;
> + struct vm_area_struct *vma;
>
> /*
> * If VMA read lock is supported, apply madvise to a single VMA
> * tentatively, avoiding walking VMAs.
> */
> - if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
> - vma = try_vma_read_lock(madv_behavior);
> - if (vma) {
> - prev = vma;
> - error = madvise_vma_behavior(vma, &prev, madv_behavior);
> - vma_end_read(vma);
> - return error;
> - }
> + if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
> + try_vma_read_lock(madv_behavior)) {
> + error = madvise_vma_behavior(madv_behavior);
> + vma_end_read(madv_behavior->vma);
> + return error;

And here we could potentially do vma_end_read() on the replaced vma. And
it's exactly cases using madvise_dontneed_free() that use
MADVISE_VMA_READ_LOCK mode. But it's not an issue as try_vma_read_lock()
will fail with uffd and that vma replacement scenario is tied to
userfaultfd_remove(). It's just quite tricky, hm...

> }
>
> /*
> @@ -1641,11 +1645,13 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> * ranges, just ignore them, but return -ENOMEM at the end.
> * - different from the way of handling in mlock etc.
> */
> - vma = find_vma_prev(mm, range->start, &prev);
> + vma = find_vma_prev(mm, range->start, &madv_behavior->prev);
> if (vma && range->start > vma->vm_start)
> - prev = vma;
> + madv_behavior->prev = vma;
>
> for (;;) {
> + struct vm_area_struct *prev;
> +
> /* Still start < end. */
> if (!vma)
> return -ENOMEM;
> @@ -1662,13 +1668,16 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> range->end = min(vma->vm_end, end);
>
> /* Here vma->vm_start <= range->start < range->end <= (end|vma->vm_end). */
> - error = madvise_vma_behavior(vma, &prev, madv_behavior);
> + madv_behavior->vma = vma;
> + error = madvise_vma_behavior(madv_behavior);
> if (error)
> return error;
> + prev = madv_behavior->prev;
> +
> range->start = range->end;
> if (prev && range->start < prev->vm_end)
> range->start = prev->vm_end;
> - if (range->start >= range->end)
> + if (range->start >= end)
> break;
> if (prev)
> vma = find_vma(mm, prev->vm_end);
On Fri, Jun 20, 2025 at 04:32:48PM +0200, Vlastimil Babka wrote:
> On 6/19/25 22:26, Lorenzo Stoakes wrote:
> > Doing so means we can get rid of all the weird struct vm_area_struct **prev
> > stuff, everything becomes consistent and in future if we want to make
> > change to behaviour there's a single place where all relevant state is
> > stored.
> >
> > This also allows us to update try_vma_read_lock() to be a little more
> > succinct and set up state for us, as well as cleaning up
> > madvise_update_vma().
> >
> > We also update the debug assertion prior to madvise_update_vma() to assert
> > that this is a write operation as correctly pointed out by Barry in the
> > relevant thread.
> >
> > We can't reasonably update the madvise functions that live outside of
> > mm/madvise.c so we leave those as-is.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

Thanks!

>
> The prev manipulation is indeed confusing, looking forward to the next patch...

Yeah this was the whole motivation for the series, ultimately...

>
> Nits:
>
> > ---
> > mm/madvise.c | 283 ++++++++++++++++++++++++++-------------------------
> > 1 file changed, 146 insertions(+), 137 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 6faa38b92111..86fe04aa7c88 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -74,6 +74,8 @@ struct madvise_behavior {
> > * traversing multiple VMAs, this is updated for each.
> > */
> > struct madvise_behavior_range range;
> > + /* The VMA and VMA preceding it (if applicable) currently targeted. */
> > + struct vm_area_struct *prev, *vma;
>
> Would also do separate lines here.

Ack will do.

>
> > -static long madvise_dontneed_free(struct vm_area_struct *vma,
> > - struct vm_area_struct **prev,
> > - unsigned long start, unsigned long end,
> > - struct madvise_behavior *madv_behavior)
> > +static long madvise_dontneed_free(struct madvise_behavior *madv_behavior)
> > {
> > + struct mm_struct *mm = madv_behavior->mm;
> > + struct madvise_behavior_range *range = &madv_behavior->range;
> > int behavior = madv_behavior->behavior;
> > - struct mm_struct *mm = vma->vm_mm;
> >
> > - *prev = vma;
> > - if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
> > + madv_behavior->prev = madv_behavior->vma;
> > + if (!madvise_dontneed_free_valid_vma(madv_behavior))
> > return -EINVAL;
> >
> > - if (start == end)
> > + if (range->start == range->end)
> > return 0;
> >
> > - if (!userfaultfd_remove(vma, start, end)) {
> > - *prev = NULL; /* mmap_lock has been dropped, prev is stale */
> > + if (!userfaultfd_remove(madv_behavior->vma, range->start, range->end)) {
> > + struct vm_area_struct *vma;
> > +
> > + madv_behavior->prev = NULL; /* mmap_lock has been dropped, prev is stale */
> >
> > mmap_read_lock(mm);
> > - vma = vma_lookup(mm, start);
> > + madv_behavior->vma = vma = vma_lookup(mm, range->start);
>
> This replaces vma in madv_behavior...

Yeah note that the lock is dropped here also :) so the previous VMA is
completely invalidated, so this is valid.

>
> > @@ -1617,23 +1625,19 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> > struct mm_struct *mm = madv_behavior->mm;
> > struct madvise_behavior_range *range = &madv_behavior->range;
> > unsigned long end = range->end;
> > - struct vm_area_struct *vma;
> > - struct vm_area_struct *prev;
> > int unmapped_error = 0;
> > int error;
> > + struct vm_area_struct *vma;
> >
> > /*
> > * If VMA read lock is supported, apply madvise to a single VMA
> > * tentatively, avoiding walking VMAs.
> > */
> > - if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
> > - vma = try_vma_read_lock(madv_behavior);
> > - if (vma) {
> > - prev = vma;
> > - error = madvise_vma_behavior(vma, &prev, madv_behavior);
> > - vma_end_read(vma);
> > - return error;
> > - }
> > + if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK &&
> > + try_vma_read_lock(madv_behavior)) {
> > + error = madvise_vma_behavior(madv_behavior);
> > + vma_end_read(madv_behavior->vma);
> > + return error;
>
> And here we could potentially do vma_end_read() on the replaced vma. And
> it's exactly cases using madvise_dontneed_free() that use
> MADVISE_VMA_READ_LOCK mode. But it's not an issue as try_vma_read_lock()
> will fail with uffd and that vma replacement scenario is tied to
> userfaultfd_remove(). It's just quite tricky, hm...

Yeah this kind of thing is explicitly why Barry excluded uffd replace from
the per-VMA lock I think. So we absolutely can't use the per-VMA lock in
conjunction with uffd_remove().

I'll add a debug assert in madvise_dontneed_free() to make this clearer.
Will add in madvise_remove() too as that does the same thing there, just to
make sure we have assert coverage.

>
> > }
> >
> > /*
> > @@ -1641,11 +1645,13 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> > * ranges, just ignore them, but return -ENOMEM at the end.
> > * - different from the way of handling in mlock etc.
> > */
> > - vma = find_vma_prev(mm, range->start, &prev);
> > + vma = find_vma_prev(mm, range->start, &madv_behavior->prev);
> > if (vma && range->start > vma->vm_start)
> > - prev = vma;
> > + madv_behavior->prev = vma;
> >
> > for (;;) {
> > + struct vm_area_struct *prev;
> > +
> > /* Still start < end. */
> > if (!vma)
> > return -ENOMEM;
> > @@ -1662,13 +1668,16 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> > range->end = min(vma->vm_end, end);
> >
> > /* Here vma->vm_start <= range->start < range->end <= (end|vma->vm_end). */
> > - error = madvise_vma_behavior(vma, &prev, madv_behavior);
> > + madv_behavior->vma = vma;
> > + error = madvise_vma_behavior(madv_behavior);
> > if (error)
> > return error;
> > + prev = madv_behavior->prev;
> > +
> > range->start = range->end;
> > if (prev && range->start < prev->vm_end)
> > range->start = prev->vm_end;
> > - if (range->start >= range->end)
> > + if (range->start >= end)
> > break;
> > if (prev)
> > vma = find_vma(mm, prev->vm_end);
>
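For illustration, the kind of assertion being discussed is roughly the
following (a sketch only; the exact wording and placement in
madvise_dontneed_free()/madvise_remove() are up to the respin):

	/*
	 * These behaviours can drop mmap_lock (e.g. via userfaultfd_remove()),
	 * so they must never be reached under the per-VMA read lock.
	 */
	VM_WARN_ON_ONCE(madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK);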
On 6/19/25 22:26, Lorenzo Stoakes wrote:
> @@ -1662,13 +1668,16 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> range->end = min(vma->vm_end, end);
>
> /* Here vma->vm_start <= range->start < range->end <= (end|vma->vm_end). */
> - error = madvise_vma_behavior(vma, &prev, madv_behavior);
> + madv_behavior->vma = vma;
> + error = madvise_vma_behavior(madv_behavior);
> if (error)
> return error;
> + prev = madv_behavior->prev;
> +
> range->start = range->end;
> if (prev && range->start < prev->vm_end)
> range->start = prev->vm_end;
> - if (range->start >= range->end)
> + if (range->start >= end)

I believe this change is fixing a bug from patch 3/5 (which I didn't catch,
sigh).

> break;
> if (prev)
> vma = find_vma(mm, prev->vm_end);
On Fri, Jun 20, 2025 at 04:16:07PM +0200, Vlastimil Babka wrote:
> On 6/19/25 22:26, Lorenzo Stoakes wrote:
> > @@ -1662,13 +1668,16 @@ int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
> > range->end = min(vma->vm_end, end);
> >
> > /* Here vma->vm_start <= range->start < range->end <= (end|vma->vm_end). */
> > - error = madvise_vma_behavior(vma, &prev, madv_behavior);
> > + madv_behavior->vma = vma;
> > + error = madvise_vma_behavior(madv_behavior);
> > if (error)
> > return error;
> > + prev = madv_behavior->prev;
> > +
> > range->start = range->end;
> > if (prev && range->start < prev->vm_end)
> > range->start = prev->vm_end;
> > - if (range->start >= range->end)
> > + if (range->start >= end)
>
> I believe this change is fixing a bug from patch 3/5 (which I didn't catch,
> sigh).

Whoops, I fixed this during development but clearly didn't backport it to
the correct commit, will fix on respin.

>
> > break;
> > if (prev)
> > vma = find_vma(mm, prev->vm_end);
>
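A rough sketch of why the old check cut the walk short (based on the loop
shown in the hunks above; illustrative, not verbatim kernel code):

	range->end = min(vma->vm_end, end);	/* end clamped to the current VMA */
	/* ... apply the behaviour to this VMA ... */
	range->start = range->end;		/* advance past the current VMA */

	/*
	 * The old test 'range->start >= range->end' is now trivially true, so
	 * the walk would stop after a single VMA.  Comparing against the
	 * caller's original 'end' instead only terminates once the whole
	 * requested range has been processed.
	 */
	if (range->start >= end)
		break;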
On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:
> Doing so means we can get rid of all the weird struct vm_area_struct **prev
> stuff, everything becomes consistent and in future if we want to make
> change to behaviour there's a single place where all relevant state is
> stored.
>
> This also allows us to update try_vma_read_lock() to be a little more
> succinct and set up state for us, as well as cleaning up
> madvise_update_vma().
>
> We also update the debug assertion prior to madvise_update_vma() to assert
> that this is a write operation as correctly pointed out by Barry in the
> relevant thread.
>
> We can't reasonably update the madvise functions that live outside of
> mm/madvise.c so we leave those as-is.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> mm/madvise.c | 283 ++++++++++++++++++++++++++-------------------------
> 1 file changed, 146 insertions(+), 137 deletions(-)
>

Acked-by: Zi Yan <ziy@nvidia.com>

--
Best Regards,
Yan, Zi
On Thu, Jun 19, 2025 at 10:20:23PM -0400, Zi Yan wrote:
> On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:
>
> > Doing so means we can get rid of all the weird struct vm_area_struct **prev
> > stuff, everything becomes consistent and in future if we want to make
> > change to behaviour there's a single place where all relevant state is
> > stored.
> >
> > This also allows us to update try_vma_read_lock() to be a little more
> > succinct and set up state for us, as well as cleaning up
> > madvise_update_vma().
> >
> > We also update the debug assertion prior to madvise_update_vma() to assert
> > that this is a write operation as correctly pointed out by Barry in the
> > relevant thread.
> >
> > We can't reasonably update the madvise functions that live outside of
> > mm/madvise.c so we leave those as-is.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > mm/madvise.c | 283 ++++++++++++++++++++++++++-------------------------
> > 1 file changed, 146 insertions(+), 137 deletions(-)
> >
>
> Acked-by: Zi Yan <ziy@nvidia.com>

Thanks!

>
> --
> Best Regards,
> Yan, Zi