[PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior

Lorenzo Stoakes posted 5 patches 3 months, 3 weeks ago
There is a newer version of this series
[PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 3 weeks ago
Rather than updating start and a confusing local parameter 'tmp' in
madvise_walk_vmas(), instead store the current range being operated upon in
the struct madvise_behavior helper object in a range pair and use this
consistently in all operations.

This makes it clearer what is going on and opens the door to further
cleanup now we store state regarding what is currently being operated upon
here.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/madvise.c | 101 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 46 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 47485653c2a1..6faa38b92111 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -58,17 +58,26 @@ enum madvise_lock_mode {
 	MADVISE_VMA_READ_LOCK,
 };
 
+struct madvise_behavior_range {
+	unsigned long start, end;
+};
+
 struct madvise_behavior {
 	struct mm_struct *mm;
 	int behavior;
 	struct mmu_gather *tlb;
 	enum madvise_lock_mode lock_mode;
 	struct anon_vma_name *anon_name;
+
+	/*
+	 * The range over which the behaviour is currently being applied. If
+	 * traversing multiple VMAs, this is updated for each.
+	 */
+	struct madvise_behavior_range range;
 };
 
 #ifdef CONFIG_ANON_VMA_NAME
-static int madvise_walk_vmas(unsigned long start, unsigned long end,
-		struct madvise_behavior *madv_behavior);
+static int madvise_walk_vmas(struct madvise_behavior *madv_behavior);
 
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
 {
@@ -149,8 +158,9 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 
 	madv_behavior.behavior =
 		anon_name ? __MADV_SET_ANON_VMA_NAME : __MADV_CLEAR_ANON_VMA_NAME;
-
-	return madvise_walk_vmas(start, end, &madv_behavior);
+	madv_behavior.range.start = start;
+	madv_behavior.range.end = end;
+	return madvise_walk_vmas(&madv_behavior);
 }
 
 static bool is_anon_vma_name(int behavior)
@@ -1012,12 +1022,13 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 		return -EINVAL;
 }
 
-static long madvise_populate(unsigned long start, unsigned long end,
-		struct madvise_behavior *madv_behavior)
+static long madvise_populate(struct madvise_behavior *madv_behavior)
 {
 	struct mm_struct *mm = madv_behavior->mm;
 	const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
 	int locked = 1;
+	unsigned long start = madv_behavior->range.start;
+	unsigned long end = madv_behavior->range.end;
 	long pages;
 
 	while (start < end) {
@@ -1308,12 +1319,13 @@ static long madvise_guard_remove(struct vm_area_struct *vma,
  */
 static int madvise_vma_behavior(struct vm_area_struct *vma,
 				struct vm_area_struct **prev,
-				unsigned long start, unsigned long end,
 				struct madvise_behavior *madv_behavior)
 {
 	int behavior = madv_behavior->behavior;
 	struct anon_vma_name *anon_name = madv_behavior->anon_name;
 	vm_flags_t new_flags = vma->vm_flags;
+	unsigned long start = madv_behavior->range.start;
+	unsigned long end = madv_behavior->range.end;
 	int error;
 
 	if (unlikely(!can_modify_vma_madv(vma, behavior)))
@@ -1425,10 +1437,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 /*
  * Error injection support for memory error handling.
  */
-static int madvise_inject_error(unsigned long start, unsigned long end,
-		struct madvise_behavior *madv_behavior)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
 {
 	unsigned long size;
+	unsigned long start = madv_behavior->range.start;
+	unsigned long end = madv_behavior->range.end;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1482,8 +1495,7 @@ static bool is_memory_failure(struct madvise_behavior *madv_behavior)
 
 #else
 
-static int madvise_inject_error(unsigned long start, unsigned long end,
-		struct madvise_behavior *madv_behavior)
+static int madvise_inject_error(struct madvise_behavior *madv_behavior)
 {
 	return 0;
 }
@@ -1565,20 +1577,20 @@ static bool process_madvise_remote_valid(int behavior)
  * If a VMA read lock could not be acquired, we return NULL and expect caller to
  * fallback to mmap lock behaviour.
  */
-static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
-		struct madvise_behavior *madv_behavior,
-		unsigned long start, unsigned long end)
+static
+struct vm_area_struct *try_vma_read_lock(struct madvise_behavior *madv_behavior)
 {
+	struct mm_struct *mm = madv_behavior->mm;
 	struct vm_area_struct *vma;
 
-	vma = lock_vma_under_rcu(mm, start);
+	vma = lock_vma_under_rcu(mm, madv_behavior->range.start);
 	if (!vma)
 		goto take_mmap_read_lock;
 	/*
 	 * Must span only a single VMA; uffd and remote processes are
 	 * unsupported.
 	 */
-	if (end > vma->vm_end || current->mm != mm ||
+	if (madv_behavior->range.end > vma->vm_end || current->mm != mm ||
 	    userfaultfd_armed(vma)) {
 		vma_end_read(vma);
 		goto take_mmap_read_lock;
@@ -1600,13 +1612,13 @@ static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
  * Must be called with the mmap_lock held for reading or writing.
  */
 static
-int madvise_walk_vmas(unsigned long start, unsigned long end,
-		      struct madvise_behavior *madv_behavior)
+int madvise_walk_vmas(struct madvise_behavior *madv_behavior)
 {
 	struct mm_struct *mm = madv_behavior->mm;
+	struct madvise_behavior_range *range = &madv_behavior->range;
+	unsigned long end = range->end;
 	struct vm_area_struct *vma;
 	struct vm_area_struct *prev;
-	unsigned long tmp;
 	int unmapped_error = 0;
 	int error;
 
@@ -1615,11 +1627,10 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 	 * tentatively, avoiding walking VMAs.
 	 */
 	if (madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
-		vma = try_vma_read_lock(mm, madv_behavior, start, end);
+		vma = try_vma_read_lock(madv_behavior);
 		if (vma) {
 			prev = vma;
-			error = madvise_vma_behavior(vma, &prev, start, end,
-						     madv_behavior);
+			error = madvise_vma_behavior(vma, &prev, madv_behavior);
 			vma_end_read(vma);
 			return error;
 		}
@@ -1630,8 +1641,8 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 	 * ranges, just ignore them, but return -ENOMEM at the end.
 	 * - different from the way of handling in mlock etc.
 	 */
-	vma = find_vma_prev(mm, start, &prev);
-	if (vma && start > vma->vm_start)
+	vma = find_vma_prev(mm, range->start, &prev);
+	if (vma && range->start > vma->vm_start)
 		prev = vma;
 
 	for (;;) {
@@ -1640,32 +1651,29 @@ int madvise_walk_vmas(unsigned long start, unsigned long end,
 			return -ENOMEM;
 
 		/* Here start < (end|vma->vm_end). */
-		if (start < vma->vm_start) {
+		if (range->start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
-			start = vma->vm_start;
-			if (start >= end)
+			range->start = vma->vm_start;
+			if (range->start >= end)
 				break;
 		}
 
-		/* Here vma->vm_start <= start < (end|vma->vm_end) */
-		tmp = vma->vm_end;
-		if (end < tmp)
-			tmp = end;
+		/* Here vma->vm_start <= range->start < (end|vma->vm_end) */
+		range->end = min(vma->vm_end, end);
 
-		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
-		error = madvise_vma_behavior(vma, &prev, start, tmp,
-					     madv_behavior);
+		/* Here vma->vm_start <= range->start < range->end <= (end|vma->vm_end). */
+		error = madvise_vma_behavior(vma, &prev, madv_behavior);
 		if (error)
 			return error;
-		start = tmp;
-		if (prev && start < prev->vm_end)
-			start = prev->vm_end;
-		if (start >= end)
+		range->start = range->end;
+		if (prev && range->start < prev->vm_end)
+			range->start = prev->vm_end;
+		if (range->start >= range->end)
 			break;
 		if (prev)
 			vma = find_vma(mm, prev->vm_end);
 		else	/* madvise_remove dropped mmap_lock */
-			vma = find_vma(mm, start);
+			vma = find_vma(mm, range->start);
 	}
 
 	return unmapped_error;
@@ -1846,22 +1854,23 @@ static int madvise_do_behavior(unsigned long start, size_t len_in,
 		struct madvise_behavior *madv_behavior)
 {
 	struct blk_plug plug;
-	unsigned long end;
 	int error;
+	struct madvise_behavior_range *range = &madv_behavior->range;
 
 	if (is_memory_failure(madv_behavior)) {
-		end = start + len_in;
-		return madvise_inject_error(start, end, madv_behavior);
+		range->start = start;
+		range->end = start + len_in;
+		return madvise_inject_error(madv_behavior);
 	}
 
-	start = get_untagged_addr(madv_behavior->mm, start);
-	end = start + PAGE_ALIGN(len_in);
+	range->start = get_untagged_addr(madv_behavior->mm, start);
+	range->end = range->start + PAGE_ALIGN(len_in);
 
 	blk_start_plug(&plug);
 	if (is_madvise_populate(madv_behavior))
-		error = madvise_populate(start, end, madv_behavior);
+		error = madvise_populate(madv_behavior);
 	else
-		error = madvise_walk_vmas(start, end, madv_behavior);
+		error = madvise_walk_vmas(madv_behavior);
 	blk_finish_plug(&plug);
 	return error;
 }
-- 
2.49.0
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Vlastimil Babka 3 months, 2 weeks ago
On 6/19/25 22:26, Lorenzo Stoakes wrote:
> Rather than updating start and a confusing local parameter 'tmp' in
> madvise_walk_vmas(), instead store the current range being operated upon in
> the struct madvise_behavior helper object in a range pair and use this
> consistently in all operations.

Yeah much better but it still took me a bit to understand that "end" is now
the original range->end that doesn't change during the iterations. Maybe
make it const and comment?

> This makes it clearer what is going on and opens the door to further
> cleanup now we store state regarding what is currently being operated upon
> here.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

> ---
>  mm/madvise.c | 101 ++++++++++++++++++++++++++++-----------------------
>  1 file changed, 55 insertions(+), 46 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 47485653c2a1..6faa38b92111 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -58,17 +58,26 @@ enum madvise_lock_mode {
>  	MADVISE_VMA_READ_LOCK,
>  };
>  
> +struct madvise_behavior_range {
> +	unsigned long start, end;

I also thought multiple names on one line is only done for local variables,
but always separate declarations in structs. But I don't know if it's
documented as such or if there are pre-existing counter examples. Consider
it a non-binding agreement with Zi :)

Thanks!
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 2 weeks ago
On Fri, Jun 20, 2025 at 03:49:31PM +0200, Vlastimil Babka wrote:
> On 6/19/25 22:26, Lorenzo Stoakes wrote:
> > Rather than updating start and a confusing local parameter 'tmp' in
> > madvise_walk_vmas(), instead store the current range being operated upon in
> > the struct madvise_behavior helper object in a range pair and use this
> > consistently in all operations.
>
> Yeah much better but it still took me a bit to understand that "end" is now
> the original range->end that doesn't change during the iterations. Maybe
> make it const and comment?

Sorry missed this, will adjust on respin.
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 2 weeks ago
On Fri, Jun 20, 2025 at 03:49:31PM +0200, Vlastimil Babka wrote:
> On 6/19/25 22:26, Lorenzo Stoakes wrote:
> > Rather than updating start and a confusing local parameter 'tmp' in
> > madvise_walk_vmas(), instead store the current range being operated upon in
> > the struct madvise_behavior helper object in a range pair and use this
> > consistently in all operations.
>
> Yeah much better but it still took me a bit to understand that "end" is now
> the original range->end that doesn't change during the iterations. Maybe
> make it const and comment?
>
> > This makes it clearer what is going on and opens the door to further
> > cleanup now we store state regarding what is currently being operated upon
> > here.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

Thanks!

>
> > ---
> >  mm/madvise.c | 101 ++++++++++++++++++++++++++++-----------------------
> >  1 file changed, 55 insertions(+), 46 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 47485653c2a1..6faa38b92111 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -58,17 +58,26 @@ enum madvise_lock_mode {
> >  	MADVISE_VMA_READ_LOCK,
> >  };
> >
> > +struct madvise_behavior_range {
> > +	unsigned long start, end;
>
> I also thought multiple names on one line is only done for local variables,
> but always separate declarations in structs. But I don't know if it's
> documented as such or if there are pre-existing counter examples. Consider
> it a non-binding agreement with Zi :)

Hm I didn't realise that was a thing at struct level, and since Zi also
highlights this I'll change it :)

Sorry Zi - my confusion, I was thinking comma-separated wasn't just for locals.

>
> Thanks!
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Zi Yan 3 months, 3 weeks ago
On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:

> Rather than updating start and a confusing local parameter 'tmp' in
> madvise_walk_vmas(), instead store the current range being operated upon in
> the struct madvise_behavior helper object in a range pair and use this
> consistently in all operations.
>
> This makes it clearer what is going on and opens the door to further
> cleanup now we store state regarding what is currently being operated upon
> here.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  mm/madvise.c | 101 ++++++++++++++++++++++++++++-----------------------
>  1 file changed, 55 insertions(+), 46 deletions(-)
>
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 47485653c2a1..6faa38b92111 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -58,17 +58,26 @@ enum madvise_lock_mode {
>  	MADVISE_VMA_READ_LOCK,
>  };
>
> +struct madvise_behavior_range {
> +	unsigned long start, end;
> +};
> +

Declare members separately?

<snip>

> @@ -1425,10 +1437,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>  /*
>   * Error injection support for memory error handling.
>   */
> -static int madvise_inject_error(unsigned long start, unsigned long end,
> -		struct madvise_behavior *madv_behavior)
> +static int madvise_inject_error(struct madvise_behavior *madv_behavior)
>  {
>  	unsigned long size;
> +	unsigned long start = madv_behavior->range.start;
> +	unsigned long end = madv_behavior->range.end;
>
>  	if (!capable(CAP_SYS_ADMIN))
>  		return -EPERM;
> @@ -1482,8 +1495,7 @@ static bool is_memory_failure(struct madvise_behavior *madv_behavior)
>
>  #else
>
> -static int madvise_inject_error(unsigned long start, unsigned long end,
> -		struct madvise_behavior *madv_behavior)
> +static int madvise_inject_error(struct madvise_behavior *madv_behavior)
>  {
>  	return 0;
>  }

OK, now I get why you pass struct madvise_behavior to madvise_inject_error()
in Patch 2. The changes make sense to me now. Maybe delay that conversation
in this one.



> @@ -1565,20 +1577,20 @@ static bool process_madvise_remote_valid(int behavior)
>   * If a VMA read lock could not be acquired, we return NULL and expect caller to
>   * fallback to mmap lock behaviour.
>   */
> -static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
> -		struct madvise_behavior *madv_behavior,
> -		unsigned long start, unsigned long end)
> +static
> +struct vm_area_struct *try_vma_read_lock(struct madvise_behavior *madv_behavior)
>  {
> +	struct mm_struct *mm = madv_behavior->mm;

Is the struct mm_struct removal missed in Patch 2?


<snip>

> @@ -1846,22 +1854,23 @@ static int madvise_do_behavior(unsigned long start, size_t len_in,
>  		struct madvise_behavior *madv_behavior)
>  {
>  	struct blk_plug plug;
> -	unsigned long end;
>  	int error;
> +	struct madvise_behavior_range *range = &madv_behavior->range;
>
>  	if (is_memory_failure(madv_behavior)) {
> -		end = start + len_in;
> -		return madvise_inject_error(start, end, madv_behavior);
> +		range->start = start;
> +		range->end = start + len_in;
> +		return madvise_inject_error(madv_behavior);
>  	}
>
> -	start = get_untagged_addr(madv_behavior->mm, start);
> -	end = start + PAGE_ALIGN(len_in);
> +	range->start = get_untagged_addr(madv_behavior->mm, start);
> +	range->end = range->start + PAGE_ALIGN(len_in);
>
>  	blk_start_plug(&plug);
>  	if (is_madvise_populate(madv_behavior))
> -		error = madvise_populate(start, end, madv_behavior);
> +		error = madvise_populate(madv_behavior);
>  	else
> -		error = madvise_walk_vmas(start, end, madv_behavior);
> +		error = madvise_walk_vmas(madv_behavior);
>  	blk_finish_plug(&plug);
>  	return error;
>  }

We almost can pass just struct madvise_behavior to madvise_do_behavior().
I wonder why memory_failure behaves differently.

--
Best Regards,
Yan, Zi
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 2 weeks ago
On Thu, Jun 19, 2025 at 09:54:11PM -0400, Zi Yan wrote:
> On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:
>
> > Rather than updating start and a confusing local parameter 'tmp' in
> > madvise_walk_vmas(), instead store the current range being operated upon in
> > the struct madvise_behavior helper object in a range pair and use this
> > consistently in all operations.
> >
> > This makes it clearer what is going on and opens the door to further
> > cleanup now we store state regarding what is currently being operated upon
> > here.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/madvise.c | 101 ++++++++++++++++++++++++++++-----------------------
> >  1 file changed, 55 insertions(+), 46 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 47485653c2a1..6faa38b92111 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -58,17 +58,26 @@ enum madvise_lock_mode {
> >  	MADVISE_VMA_READ_LOCK,
> >  };
> >
> > +struct madvise_behavior_range {
> > +	unsigned long start, end;
> > +};
> > +
>
> Declare members separately?

Can do, but this is one of those subject things where everyone has different
views, if I did it the other way no doubt somebody else would comment about
declaring together :P

I think as a range here it's not a big deal unless you feel strongly about it?

>
> <snip>
>
> > @@ -1425,10 +1437,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> >  /*
> >   * Error injection support for memory error handling.
> >   */
> > -static int madvise_inject_error(unsigned long start, unsigned long end,
> > -		struct madvise_behavior *madv_behavior)
> > +static int madvise_inject_error(struct madvise_behavior *madv_behavior)
> >  {
> >  	unsigned long size;
> > +	unsigned long start = madv_behavior->range.start;
> > +	unsigned long end = madv_behavior->range.end;
> >
> >  	if (!capable(CAP_SYS_ADMIN))
> >  		return -EPERM;
> > @@ -1482,8 +1495,7 @@ static bool is_memory_failure(struct madvise_behavior *madv_behavior)
> >
> >  #else
> >
> > -static int madvise_inject_error(unsigned long start, unsigned long end,
> > -		struct madvise_behavior *madv_behavior)
> > +static int madvise_inject_error(struct madvise_behavior *madv_behavior)
> >  {
> >  	return 0;
> >  }
>
> OK, now I get why you pass struct madvise_behavior to madvise_inject_error()
> in Patch 2. The changes make sense to me now. Maybe delay that conversation
> in this one.

I think it's valuable there because otherwise all the function invocations were
inconsistent, but after 2/5 become completely consistent. I mention this in the
commit message and I think it's valuable so you're not doing:

if (foo)
	bar(x, y, z)

if (blah)
	baz(y, x, z)

etc.

When you quickly read through it's easy to get confused/lost as to what's going
on, whereas if they all have the same signatures it's very clear you're
offloading the heavy lifting to each function.

>
>
>
> > @@ -1565,20 +1577,20 @@ static bool process_madvise_remote_valid(int behavior)
> >   * If a VMA read lock could not be acquired, we return NULL and expect caller to
> >   * fallback to mmap lock behaviour.
> >   */
> > -static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
> > -		struct madvise_behavior *madv_behavior,
> > -		unsigned long start, unsigned long end)
> > +static
> > +struct vm_area_struct *try_vma_read_lock(struct madvise_behavior *madv_behavior)
> >  {
> > +	struct mm_struct *mm = madv_behavior->mm;
>
> Is the struct mm_struct removal missed in Patch 2?

Yeah, I will go back and put it in on respin.

>
>
> <snip>
>
> > @@ -1846,22 +1854,23 @@ static int madvise_do_behavior(unsigned long start, size_t len_in,
> >  		struct madvise_behavior *madv_behavior)
> >  {
> >  	struct blk_plug plug;
> > -	unsigned long end;
> >  	int error;
> > +	struct madvise_behavior_range *range = &madv_behavior->range;
> >
> >  	if (is_memory_failure(madv_behavior)) {
> > -		end = start + len_in;
> > -		return madvise_inject_error(start, end, madv_behavior);
> > +		range->start = start;
> > +		range->end = start + len_in;
> > +		return madvise_inject_error(madv_behavior);
> >  	}
> >
> > -	start = get_untagged_addr(madv_behavior->mm, start);
> > -	end = start + PAGE_ALIGN(len_in);
> > +	range->start = get_untagged_addr(madv_behavior->mm, start);
> > +	range->end = range->start + PAGE_ALIGN(len_in);
> >
> >  	blk_start_plug(&plug);
> >  	if (is_madvise_populate(madv_behavior))
> > -		error = madvise_populate(start, end, madv_behavior);
> > +		error = madvise_populate(madv_behavior);
> >  	else
> > -		error = madvise_walk_vmas(start, end, madv_behavior);
> > +		error = madvise_walk_vmas(madv_behavior);
> >  	blk_finish_plug(&plug);
> >  	return error;
> >  }
>
> We almost can pass just struct madvise_behavior to madvise_do_behavior().
> I wonder why memory_failure behaves differently.

There's complexity around the start, end stuff (Barry bumped into some of this)
and I don't want to mess with that in this series. This series is meant to have
no functional changes.

>
> --
> Best Regards,
> Yan, Zi
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Zi Yan 3 months, 3 weeks ago
On 19 Jun 2025, at 21:54, Zi Yan wrote:

> On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:
>
>> Rather than updating start and a confusing local parameter 'tmp' in
>> madvise_walk_vmas(), instead store the current range being operated upon in
>> the struct madvise_behavior helper object in a range pair and use this
>> consistently in all operations.
>>
>> This makes it clearer what is going on and opens the door to further
>> cleanup now we store state regarding what is currently being operated upon
>> here.
>>
>> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>> ---
>>  mm/madvise.c | 101 ++++++++++++++++++++++++++++-----------------------
>>  1 file changed, 55 insertions(+), 46 deletions(-)
>>
>> diff --git a/mm/madvise.c b/mm/madvise.c
>> index 47485653c2a1..6faa38b92111 100644
>> --- a/mm/madvise.c
>> +++ b/mm/madvise.c
>> @@ -58,17 +58,26 @@ enum madvise_lock_mode {
>>  	MADVISE_VMA_READ_LOCK,
>>  };
>>
>> +struct madvise_behavior_range {
>> +	unsigned long start, end;
>> +};
>> +
>
> Declare members separately?
>
> <snip>
>
>> @@ -1425,10 +1437,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>>  /*
>>   * Error injection support for memory error handling.
>>   */
>> -static int madvise_inject_error(unsigned long start, unsigned long end,
>> -		struct madvise_behavior *madv_behavior)
>> +static int madvise_inject_error(struct madvise_behavior *madv_behavior)
>>  {
>>  	unsigned long size;
>> +	unsigned long start = madv_behavior->range.start;
>> +	unsigned long end = madv_behavior->range.end;
>>
>>  	if (!capable(CAP_SYS_ADMIN))
>>  		return -EPERM;
>> @@ -1482,8 +1495,7 @@ static bool is_memory_failure(struct madvise_behavior *madv_behavior)
>>
>>  #else
>>
>> -static int madvise_inject_error(unsigned long start, unsigned long end,
>> -		struct madvise_behavior *madv_behavior)
>> +static int madvise_inject_error(struct madvise_behavior *madv_behavior)
>>  {
>>  	return 0;
>>  }
>
> OK, now I get why you pass struct madvise_behavior to madvise_inject_error()
> in Patch 2. The changes make sense to me now. Maybe delay that conversation
> in this one.
>
>
>
>> @@ -1565,20 +1577,20 @@ static bool process_madvise_remote_valid(int behavior)
>>   * If a VMA read lock could not be acquired, we return NULL and expect caller to
>>   * fallback to mmap lock behaviour.
>>   */
>> -static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
>> -		struct madvise_behavior *madv_behavior,
>> -		unsigned long start, unsigned long end)
>> +static
>> +struct vm_area_struct *try_vma_read_lock(struct madvise_behavior *madv_behavior)
>>  {
>> +	struct mm_struct *mm = madv_behavior->mm;
>
> Is the struct mm_struct removal missed in Patch 2?
>
>
> <snip>
>
>> @@ -1846,22 +1854,23 @@ static int madvise_do_behavior(unsigned long start, size_t len_in,
>>  		struct madvise_behavior *madv_behavior)
>>  {
>>  	struct blk_plug plug;
>> -	unsigned long end;
>>  	int error;
>> +	struct madvise_behavior_range *range = &madv_behavior->range;
>>
>>  	if (is_memory_failure(madv_behavior)) {
>> -		end = start + len_in;
>> -		return madvise_inject_error(start, end, madv_behavior);
>> +		range->start = start;
>> +		range->end = start + len_in;
>> +		return madvise_inject_error(madv_behavior);
>>  	}
>>
>> -	start = get_untagged_addr(madv_behavior->mm, start);
>> -	end = start + PAGE_ALIGN(len_in);
>> +	range->start = get_untagged_addr(madv_behavior->mm, start);
>> +	range->end = range->start + PAGE_ALIGN(len_in);
>>
>>  	blk_start_plug(&plug);
>>  	if (is_madvise_populate(madv_behavior))
>> -		error = madvise_populate(start, end, madv_behavior);
>> +		error = madvise_populate(madv_behavior);
>>  	else
>> -		error = madvise_walk_vmas(start, end, madv_behavior);
>> +		error = madvise_walk_vmas(madv_behavior);
>>  	blk_finish_plug(&plug);
>>  	return error;
>>  }
>
> We almost can pass just struct madvise_behavior to madvise_do_behavior().
> I wonder why memory_failure behaves differently.

Based on git history, it seems that no one paid attention to
madvise_inject_error() and the [start, start + len_in] has never been
changed since it was added back from 2009.

OK, it seems that Kirill (cc'd) moved start = untagged_addr(start); from
before madvise_inject_error() to after it at commit 428e106ae1ad
("mm: Introduce untagged_addr_remote()"). It changed code behavior.

So memory_failure should get the same range as others, meaning
madvise_do_behavior() can just take struct madvise_behavior
and the range can be set at the call sites.

--
Best Regards,
Yan, Zi
Re: [PATCH 3/5] mm/madvise: thread VMA range state through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 2 weeks ago
On Thu, Jun 19, 2025 at 10:13:19PM -0400, Zi Yan wrote:
> On 19 Jun 2025, at 21:54, Zi Yan wrote:
> >> @@ -1846,22 +1854,23 @@ static int madvise_do_behavior(unsigned long start, size_t len_in,
> >>  		struct madvise_behavior *madv_behavior)
> >>  {
> >>  	struct blk_plug plug;
> >> -	unsigned long end;
> >>  	int error;
> >> +	struct madvise_behavior_range *range = &madv_behavior->range;
> >>
> >>  	if (is_memory_failure(madv_behavior)) {
> >> -		end = start + len_in;
> >> -		return madvise_inject_error(start, end, madv_behavior);
> >> +		range->start = start;
> >> +		range->end = start + len_in;
> >> +		return madvise_inject_error(madv_behavior);
> >>  	}
> >>
> >> -	start = get_untagged_addr(madv_behavior->mm, start);
> >> -	end = start + PAGE_ALIGN(len_in);
> >> +	range->start = get_untagged_addr(madv_behavior->mm, start);
> >> +	range->end = range->start + PAGE_ALIGN(len_in);
> >>
> >>  	blk_start_plug(&plug);
> >>  	if (is_madvise_populate(madv_behavior))
> >> -		error = madvise_populate(start, end, madv_behavior);
> >> +		error = madvise_populate(madv_behavior);
> >>  	else
> >> -		error = madvise_walk_vmas(start, end, madv_behavior);
> >> +		error = madvise_walk_vmas(madv_behavior);
> >>  	blk_finish_plug(&plug);
> >>  	return error;
> >>  }
> >
> > We almost can pass just struct madvise_behavior to madvise_do_behavior().
> > I wonder why memory_failure behaves differently.
>
> Based on git history, it seems that no one paid attention to
> madvise_inject_error() and the [start, start + len_in] has never been
> changed since it was added back from 2009.
>
> OK, it seems that Kirill (cc'd) moved start = untagged_addr(start); from
> before madvise_inject_error() to after it at commit 428e106ae1ad
> ("mm: Introduce untagged_addr_remote()"). It changed code behavior.
>
> So memory_failure should get the same range as others, meaning
> madvise_do_behavior() can just take struct madvise_behavior
> and the range can be set at the call sites.

Well it's also page aligned in other cases right?

Anyway overall I'm not touching that in this series, sorry. This is a clean
up, and this stuff is a functional change that needs to be carefully
thought out.

So this is legitimately I think one for a follow-up. But this series makes
it easier to fix up later :)