From: Barry Song <v-songbaohua@oppo.com>
MADV_FREE is another option, besides MADV_DONTNEED, for dynamic memory
freeing in user-space native or Java heap memory management. For example,
jemalloc can be configured to use MADV_FREE, and recent versions of the
Android Java heap have also increasingly adopted MADV_FREE. Supporting
per-VMA locking for MADV_FREE thus appears increasingly necessary.
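
As a concrete illustration of that usage pattern, here is a minimal
userspace sketch (Linux-specific, illustrative only, error handling
trimmed) of how an allocator marks a range lazily freeable:

	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 1UL << 20;	/* 1 MiB */
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		memset(p, 1, len);	/* fault the pages in */
		/* Done with the range: let the kernel reclaim it lazily. */
		madvise(p, len, MADV_FREE);
		munmap(p, len);
		return 0;
	}

With this series, the page-table walk behind that madvise() call can run
under the per-VMA read lock rather than the mmap read lock.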
We have replaced walk_page_range() with walk_page_range_vma(). Along with
the madvise_lock_mode proposed by Lorenzo, the necessary infrastructure is
now in place to begin exploring per-VMA locking support for MADV_FREE and
potentially other madvise operations that use walk_page_range_vma().
This patch adds support for the PGWALK_VMA_RDLOCK_VERIFY walk_lock mode in
walk_page_range_vma(), and uses the madvise_lock_mode from madv_behavior
to select the appropriate walk_lock, either mmap_lock or per-VMA lock,
depending on the context.
To ensure thread safety, madvise_free_walk_ops is now defined as a stack
variable instead of a global constant.
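
To sketch the race this avoids (a userspace analogy only, not kernel code;
the struct, field, and function names below are hypothetical stand-ins):

	#include <pthread.h>
	#include <stdio.h>

	struct ops { int lock_mode; };		/* hypothetical stand-in */
	static struct ops shared_ops;		/* old scheme: one mutable global */

	static void *walker(void *arg)
	{
		int mode = (int)(long)arg;

		/* Racy: the other thread may overwrite lock_mode between
		 * this store and the walk that consumes it. */
		shared_ops.lock_mode = mode;

		/* Safe: a per-call stack copy is private to this thread. */
		struct ops local_ops = { .lock_mode = mode };
		printf("walking with mode %d\n", local_ops.lock_mode);
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, walker, (void *)0L);
		pthread_create(&b, NULL, walker, (void *)1L);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}

Previously this was not a concern because walk_lock was a compile-time
constant that no caller ever wrote to.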
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
---
 include/linux/pagewalk.h |  2 ++
 mm/madvise.c             | 20 ++++++++++++++------
 mm/pagewalk.c            |  6 ++++++
 3 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index 9700a29f8afb..a4afa64ef0ab 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -14,6 +14,8 @@ enum page_walk_lock {
 	PGWALK_WRLOCK = 1,
 	/* vma is expected to be already write-locked during the walk */
 	PGWALK_WRLOCK_VERIFY = 2,
+	/* vma is expected to be already read-locked during the walk */
+	PGWALK_VMA_RDLOCK_VERIFY = 3,
 };
 
 /**
diff --git a/mm/madvise.c b/mm/madvise.c
index 381eedde8f6d..23d58eb31c8f 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -775,10 +775,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 
-static const struct mm_walk_ops madvise_free_walk_ops = {
-	.pmd_entry = madvise_free_pte_range,
-	.walk_lock = PGWALK_RDLOCK,
-};
+static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
+{
+	/* Other modes don't require fixing up the walk_lock. */
+	VM_WARN_ON_ONCE(mode != MADVISE_VMA_READ_LOCK &&
+			mode != MADVISE_MMAP_READ_LOCK);
+	return mode == MADVISE_VMA_READ_LOCK ?
+	       PGWALK_VMA_RDLOCK_VERIFY : PGWALK_RDLOCK;
+}
 
 static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 				   struct vm_area_struct *vma,
@@ -787,6 +791,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 	struct mm_struct *mm = vma->vm_mm;
 	struct mmu_notifier_range range;
 	struct mmu_gather *tlb = madv_behavior->tlb;
+	struct mm_walk_ops walk_ops = {
+		.pmd_entry = madvise_free_pte_range,
+	};
 
 	/* MADV_FREE works for only anon vma at the moment */
 	if (!vma_is_anonymous(vma))
@@ -806,8 +813,9 @@ static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
 
 	mmu_notifier_invalidate_range_start(&range);
 	tlb_start_vma(tlb, vma);
+	walk_ops.walk_lock = get_walk_lock(madv_behavior->lock_mode);
 	walk_page_range_vma(vma, range.start, range.end,
-			&madvise_free_walk_ops, tlb);
+			&walk_ops, tlb);
 	tlb_end_vma(tlb, vma);
 	mmu_notifier_invalidate_range_end(&range);
 	return 0;
@@ -1653,7 +1661,6 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
 	case MADV_WILLNEED:
 	case MADV_COLD:
 	case MADV_PAGEOUT:
-	case MADV_FREE:
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
 	case MADV_COLLAPSE:
@@ -1662,6 +1669,7 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
 		return MADVISE_MMAP_READ_LOCK;
 	case MADV_DONTNEED:
 	case MADV_DONTNEED_LOCKED:
+	case MADV_FREE:
 		return MADVISE_VMA_READ_LOCK;
 	default:
 		return MADVISE_MMAP_WRITE_LOCK;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e478777c86e1..c984aacc5552 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -420,6 +420,9 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 static inline void process_mm_walk_lock(struct mm_struct *mm,
 					enum page_walk_lock walk_lock)
 {
+	if (walk_lock == PGWALK_VMA_RDLOCK_VERIFY)
+		return;
+
 	if (walk_lock == PGWALK_RDLOCK)
 		mmap_assert_locked(mm);
 	else
@@ -437,6 +440,9 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma,
 	case PGWALK_WRLOCK_VERIFY:
 		vma_assert_write_locked(vma);
 		break;
+	case PGWALK_VMA_RDLOCK_VERIFY:
+		vma_assert_locked(vma);
+		break;
 	case PGWALK_RDLOCK:
 		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
 		break;
--
2.39.3 (Apple Git-146)
On Tue, Jun 10, 2025 at 05:59:20PM +1200, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
>
> MADV_FREE is another option, besides MADV_DONTNEED, for dynamic memory
> freeing in user-space native or Java heap memory management. For example,
> jemalloc can be configured to use MADV_FREE, and recent versions of the
> Android Java heap have also increasingly adopted MADV_FREE. Supporting
> per-VMA locking for MADV_FREE thus appears increasingly necessary.
>
> We have replaced walk_page_range() with walk_page_range_vma(). Along with
> the madvise_lock_mode proposed by Lorenzo, the necessary infrastructure is
> now in place to begin exploring per-VMA locking support for MADV_FREE and
> potentially other madvise operations that use walk_page_range_vma().

Thanks :)

> This patch adds support for the PGWALK_VMA_RDLOCK_VERIFY walk_lock mode in
> walk_page_range_vma(), and uses the madvise_lock_mode from madv_behavior
> to select the appropriate walk_lock, either mmap_lock or per-VMA lock,
> depending on the context.
>
> To ensure thread safety, madvise_free_walk_ops is now defined as a stack
> variable instead of a global constant.

A nit but I'd add 'because we now dynamically update the walk_ops->walk_lock
field we must make sure this is thread safe' or something like this to
clarify the need for this.

Did we not have to worry about this before, I guess, because the mmap lock
would exclude other threads?

An aside, but I wonder if we have this implicit assumption elsewhere that
VMA locks defeat... hm :)

> Signed-off-by: Barry Song <v-songbaohua@oppo.com>

Looks good to me, kinda neat how the previous work for the MADV_DONTNEED
under VMA lock stuff made this pretty straightforward :)

So:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

[snip]

> +static inline enum page_walk_lock get_walk_lock(enum madvise_lock_mode mode)
> +{
> +	/* Other modes don't require fixing up the walk_lock. */
> +	VM_WARN_ON_ONCE(mode != MADVISE_VMA_READ_LOCK &&
> +			mode != MADVISE_MMAP_READ_LOCK);

I find this a bit hard to parse...

> +	return mode == MADVISE_VMA_READ_LOCK ?
> +	       PGWALK_VMA_RDLOCK_VERIFY : PGWALK_RDLOCK;

...might be better as something like:

	switch (mode) {
	case MADVISE_VMA_READ_LOCK:
		return PGWALK_VMA_RDLOCK_VERIFY;
	case MADVISE_MMAP_READ_LOCK:
		return PGWALK_RDLOCK;
	default:
		/* Invalid. */
		WARN_ON_ONCE(1);
		return PGWALK_RDLOCK;
	}

> +}

[snip]
On Wed, Jun 11, 2025 at 6:52 AM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Jun 10, 2025 at 05:59:20PM +1200, Barry Song wrote:
[snip]
> > To ensure thread safety, madvise_free_walk_ops is now defined as a stack
> > variable instead of a global constant.
>
> A nit but I'd add 'because we now dynamically update the walk_ops->walk_lock
> field we must make sure this is thread safe' or something like this to
> clarify the need for this.

Sure.

> Did we not have to worry about this before, I guess, because the mmap lock
> would exclude other threads?

Probably not. It was a constant, and no one needed to modify it
before, no matter how many threads called MADV_FREE.

> An aside, but I wonder if we have this implicit assumption elsewhere that
> VMA locks defeat... hm :)
>
> Looks good to me, kinda neat how the previous work for the MADV_DONTNEED
> under VMA lock stuff made this pretty straightforward :)
>
> So:
>
> Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Thanks!

[snip]

> > +	return mode == MADVISE_VMA_READ_LOCK ?
> > +	       PGWALK_VMA_RDLOCK_VERIFY : PGWALK_RDLOCK;
>
> ...might be better as something like:
>
> 	switch (mode) {
> 	case MADVISE_VMA_READ_LOCK:
> 		return PGWALK_VMA_RDLOCK_VERIFY;
> 	case MADVISE_MMAP_READ_LOCK:
> 		return PGWALK_RDLOCK;
> 	default:
> 		/* Invalid. */
> 		WARN_ON_ONCE(1);
> 		return PGWALK_RDLOCK;
> 	}

I actually tried both before sending and, for some reason, preferred
the one I sent. But I'm totally happy to go with whichever approach
you prefer :-)

[snip]

Thanks
Barry
On Wed, Jun 11, 2025 at 10:03:05PM +1200, Barry Song wrote:
> On Wed, Jun 11, 2025 at 6:52 AM Lorenzo Stoakes
> <lorenzo.stoakes@oracle.com> wrote:
[snip]
> > A nit but I'd add 'because we now dynamically update the walk_ops->walk_lock
> > field we must make sure this is thread safe' or something like this to
> > clarify the need for this.
>
> Sure.

Thanks!

> > Did we not have to worry about this before, I guess, because the mmap lock
> > would exclude other threads?
>
> Probably not. It was a constant, and no one needed to modify it
> before, no matter how many threads called MADV_FREE.

Yeah, of course. I wrote this before I went and looked more carefully at
the comment preceding this and... anyway, yeah :)

> > An aside, but I wonder if we have this implicit assumption elsewhere that
> > VMA locks defeat... hm :)

This is more the concern, though not to do with your series but more a
general thing: 'in how many places do we make implicit assumptions about
things like thread concurrency due to locking we presume we have?'

Hopefully this isn't common :>)

> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Thanks!

No problem! Thanks for the patch, this is a nice step forward.

> > ...might be better as something like:
> >
> > 	switch (mode) {
[snip]
>
> I actually tried both before sending and, for some reason, preferred
> the one I sent. But I'm totally happy to go with whichever approach
> you prefer :-)

It's not a big deal, obviously I'd prefer this version, but not a blocker!

[snip]

Cheers, Lorenzo
On 10.06.25 07:59, Barry Song wrote:
> From: Barry Song <v-songbaohua@oppo.com>
[snip]
> @@ -420,6 +420,9 @@ static int __walk_page_range(unsigned long start, unsigned long end,
>  static inline void process_mm_walk_lock(struct mm_struct *mm,
>  					enum page_walk_lock walk_lock)
>  {
> +	if (walk_lock == PGWALK_VMA_RDLOCK_VERIFY)
> +		return;
> +
>  	if (walk_lock == PGWALK_RDLOCK)
>  		mmap_assert_locked(mm);

Nit: I'd have converted the "else" into
"else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)".

>  	else
[snip]
> +	case PGWALK_VMA_RDLOCK_VERIFY:
> +		vma_assert_locked(vma);
> +		break;
[snip]

Nothing jumped at me and I think this should be ok.

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Cheers,

David / dhildenb
On Tue, Jun 10, 2025 at 7:04 PM David Hildenbrand <david@redhat.com> wrote:
>
> On 10.06.25 07:59, Barry Song wrote:
[snip]
> > +	if (walk_lock == PGWALK_VMA_RDLOCK_VERIFY)
> > +		return;
> > +
> >  	if (walk_lock == PGWALK_RDLOCK)
> >  		mmap_assert_locked(mm);
>
> Nit: I'd have converted the "else" into
> "else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)".

Seems good to me.

[snip]
>
> Nothing jumped at me and I think this should be ok.
>
> Acked-by: David Hildenbrand <david@redhat.com>

Thanks!