[PATCH 2/5] mm/madvise: thread mm_struct through madvise_behavior

Lorenzo Stoakes posted 5 patches 3 months, 3 weeks ago
There is a newer version of this series
[PATCH 2/5] mm/madvise: thread mm_struct through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 3 weeks ago
There's no need to thread a pointer to the mm_struct nor have different
functions signatures for each behaviour, instead store state in the struct
madvise_behavior object consistently and use it for all madvise() actions.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/madvise.c | 105 ++++++++++++++++++++++++++-------------------------
 1 file changed, 54 insertions(+), 51 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 9dd935d64692..47485653c2a1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -59,6 +59,7 @@ enum madvise_lock_mode {
 };
 
 struct madvise_behavior {
+	struct mm_struct *mm;
 	int behavior;
 	struct mmu_gather *tlb;
 	enum madvise_lock_mode lock_mode;
@@ -66,8 +67,8 @@ struct madvise_behavior {
 };
 
 #ifdef CONFIG_ANON_VMA_NAME
-static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		unsigned long end, struct madvise_behavior *madv_behavior);
+static int madvise_walk_vmas(unsigned long start, unsigned long end,
+		struct madvise_behavior *madv_behavior);
 
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
 {
@@ -126,6 +127,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	unsigned long end;
 	unsigned long len;
 	struct madvise_behavior madv_behavior = {
+		.mm = mm,
 		.lock_mode = MADVISE_MMAP_WRITE_LOCK,
 		.anon_name = anon_name,
 	};
@@ -148,7 +150,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	madv_behavior.behavior =
 		anon_name ? __MADV_SET_ANON_VMA_NAME : __MADV_CLEAR_ANON_VMA_NAME;
 
-	return madvise_walk_vmas(mm, start, end, &madv_behavior);
+	return madvise_walk_vmas(start, end, &madv_behavior);
 }
 
 static bool is_anon_vma_name(int behavior)
@@ -1010,10 +1012,11 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
 		return -EINVAL;
 }
 
-static long madvise_populate(struct mm_struct *mm, unsigned long start,
-		unsigned long end, int behavior)
+static long madvise_populate(unsigned long start, unsigned long end,
+		struct madvise_behavior *madv_behavior)
 {
-	const bool write = behavior == MADV_POPULATE_WRITE;
+	struct mm_struct *mm = madv_behavior->mm;
+	const bool write = madv_behavior->behavior == MADV_POPULATE_WRITE;
 	int locked = 1;
 	long pages;
 
@@ -1422,15 +1425,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 /*
  * Error injection support for memory error handling.
  */
-static int madvise_inject_error(int behavior,
-		unsigned long start, unsigned long end)
+static int madvise_inject_error(unsigned long start, unsigned long end,
+		struct madvise_behavior *madv_behavior)
 {
 	unsigned long size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-
 	for (; start < end; start += size) {
 		unsigned long pfn;
 		struct page *page;
@@ -1448,7 +1450,7 @@ static int madvise_inject_error(int behavior,
 		 */
 		size = page_size(compound_head(page));
 
-		if (behavior == MADV_SOFT_OFFLINE) {
+		if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
 			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
 				 pfn, start);
 			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
@@ -1467,9 +1469,9 @@ static int madvise_inject_error(int behavior,
 	return 0;
 }
 
-static bool is_memory_failure(int behavior)
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
 {
-	switch (behavior) {
+	switch (madv_behavior->behavior) {
 	case MADV_HWPOISON:
 	case MADV_SOFT_OFFLINE:
 		return true;
@@ -1480,13 +1482,13 @@ static bool is_memory_failure(int behavior)
 
 #else
 
-static int madvise_inject_error(int behavior,
-		unsigned long start, unsigned long end)
+static int madvise_inject_error(unsigned long start, unsigned long end,
+		struct madvise_behavior *madv_behavior)
 {
 	return 0;
 }
 
-static bool is_memory_failure(int behavior)
+static bool is_memory_failure(struct madvise_behavior *madv_behavior)
 {
 	return false;
 }
@@ -1598,9 +1600,10 @@ static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
  * Must be called with the mmap_lock held for reading or writing.
  */
 static
-int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		      unsigned long end, struct madvise_behavior *madv_behavior)
+int madvise_walk_vmas(unsigned long start, unsigned long end,
+		      struct madvise_behavior *madv_behavior)
 {
+	struct mm_struct *mm = madv_behavior->mm;
 	struct vm_area_struct *vma;
 	struct vm_area_struct *prev;
 	unsigned long tmp;
@@ -1675,12 +1678,10 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
  */
 static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
 {
-	int behavior = madv_behavior->behavior;
-
-	if (is_memory_failure(behavior))
+	if (is_memory_failure(madv_behavior))
 		return MADVISE_NO_LOCK;
 
-	switch (behavior) {
+	switch (madv_behavior->behavior) {
 	case MADV_REMOVE:
 	case MADV_WILLNEED:
 	case MADV_COLD:
@@ -1700,9 +1701,9 @@ static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavi
 	}
 }
 
-static int madvise_lock(struct mm_struct *mm,
-		struct madvise_behavior *madv_behavior)
+static int madvise_lock(struct madvise_behavior *madv_behavior)
 {
+	struct mm_struct *mm = madv_behavior->mm;
 	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
 
 	switch (lock_mode) {
@@ -1724,9 +1725,10 @@ static int madvise_lock(struct mm_struct *mm,
 	return 0;
 }
 
-static void madvise_unlock(struct mm_struct *mm,
-		struct madvise_behavior *madv_behavior)
+static void madvise_unlock(struct madvise_behavior *madv_behavior)
 {
+	struct mm_struct *mm = madv_behavior->mm;
+
 	switch (madv_behavior->lock_mode) {
 	case  MADVISE_NO_LOCK:
 		return;
@@ -1756,11 +1758,10 @@ static bool madvise_batch_tlb_flush(int behavior)
 	}
 }
 
-static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
-		struct mm_struct *mm)
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior)
 {
 	if (madvise_batch_tlb_flush(madv_behavior->behavior))
-		tlb_gather_mmu(madv_behavior->tlb, mm);
+		tlb_gather_mmu(madv_behavior->tlb, madv_behavior->mm);
 }
 
 static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
@@ -1815,9 +1816,9 @@ static bool madvise_should_skip(unsigned long start, size_t len_in,
 	return false;
 }
 
-static bool is_madvise_populate(int behavior)
+static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
 {
-	switch (behavior) {
+	switch (madv_behavior->behavior) {
 	case MADV_POPULATE_READ:
 	case MADV_POPULATE_WRITE:
 		return true;
@@ -1841,25 +1842,26 @@ static inline unsigned long get_untagged_addr(struct mm_struct *mm,
 				   untagged_addr_remote(mm, start);
 }
 
-static int madvise_do_behavior(struct mm_struct *mm,
-		unsigned long start, size_t len_in,
+static int madvise_do_behavior(unsigned long start, size_t len_in,
 		struct madvise_behavior *madv_behavior)
 {
-	int behavior = madv_behavior->behavior;
 	struct blk_plug plug;
 	unsigned long end;
 	int error;
 
-	if (is_memory_failure(behavior))
-		return madvise_inject_error(behavior, start, start + len_in);
-	start = get_untagged_addr(mm, start);
+	if (is_memory_failure(madv_behavior)) {
+		end = start + len_in;
+		return madvise_inject_error(start, end, madv_behavior);
+	}
+
+	start = get_untagged_addr(madv_behavior->mm, start);
 	end = start + PAGE_ALIGN(len_in);
 
 	blk_start_plug(&plug);
-	if (is_madvise_populate(behavior))
-		error = madvise_populate(mm, start, end, behavior);
+	if (is_madvise_populate(madv_behavior))
+		error = madvise_populate(start, end, madv_behavior);
 	else
-		error = madvise_walk_vmas(mm, start, end, madv_behavior);
+		error = madvise_walk_vmas(start, end, madv_behavior);
 	blk_finish_plug(&plug);
 	return error;
 }
@@ -1941,19 +1943,20 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	int error;
 	struct mmu_gather tlb;
 	struct madvise_behavior madv_behavior = {
+		.mm = mm,
 		.behavior = behavior,
 		.tlb = &tlb,
 	};
 
 	if (madvise_should_skip(start, len_in, behavior, &error))
 		return error;
-	error = madvise_lock(mm, &madv_behavior);
+	error = madvise_lock(&madv_behavior);
 	if (error)
 		return error;
-	madvise_init_tlb(&madv_behavior, mm);
-	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
+	madvise_init_tlb(&madv_behavior);
+	error = madvise_do_behavior(start, len_in, &madv_behavior);
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, &madv_behavior);
+	madvise_unlock(&madv_behavior);
 
 	return error;
 }
@@ -1971,16 +1974,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 	size_t total_len;
 	struct mmu_gather tlb;
 	struct madvise_behavior madv_behavior = {
+		.mm = mm,
 		.behavior = behavior,
 		.tlb = &tlb,
 	};
 
 	total_len = iov_iter_count(iter);
 
-	ret = madvise_lock(mm, &madv_behavior);
+	ret = madvise_lock(&madv_behavior);
 	if (ret)
 		return ret;
-	madvise_init_tlb(&madv_behavior, mm);
+	madvise_init_tlb(&madv_behavior);
 
 	while (iov_iter_count(iter)) {
 		unsigned long start = (unsigned long)iter_iov_addr(iter);
@@ -1990,8 +1994,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		if (madvise_should_skip(start, len_in, behavior, &error))
 			ret = error;
 		else
-			ret = madvise_do_behavior(mm, start, len_in,
-					&madv_behavior);
+			ret = madvise_do_behavior(start, len_in, &madv_behavior);
 		/*
 		 * An madvise operation is attempting to restart the syscall,
 		 * but we cannot proceed as it would not be correct to repeat
@@ -2010,11 +2013,11 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
-			madvise_unlock(mm, &madv_behavior);
-			ret = madvise_lock(mm, &madv_behavior);
+			madvise_unlock(&madv_behavior);
+			ret = madvise_lock(&madv_behavior);
 			if (ret)
 				goto out;
-			madvise_init_tlb(&madv_behavior, mm);
+			madvise_init_tlb(&madv_behavior);
 			continue;
 		}
 		if (ret < 0)
@@ -2022,7 +2025,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, &madv_behavior);
+	madvise_unlock(&madv_behavior);
 
 out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
-- 
2.49.0
Re: [PATCH 2/5] mm/madvise: thread mm_struct through madvise_behavior
Posted by Vlastimil Babka 3 months, 2 weeks ago
On 6/19/25 22:26, Lorenzo Stoakes wrote:
> There's no need to thread a pointer to the mm_struct nor have different
> functions signatures for each behaviour, instead store state in the struct
> madvise_behavior object consistently and use it for all madvise() actions.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Re: [PATCH 2/5] mm/madvise: thread mm_struct through madvise_behavior
Posted by Zi Yan 3 months, 3 weeks ago
On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:

> There's no need to thread a pointer to the mm_struct nor have different
> functions signatures for each behaviour, instead store state in the struct
> madvise_behavior object consistently and use it for all madvise() actions.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  mm/madvise.c | 105 ++++++++++++++++++++++++++-------------------------
>  1 file changed, 54 insertions(+), 51 deletions(-)
>

<snip>

> @@ -1422,15 +1425,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
>  /*
>   * Error injection support for memory error handling.
>   */
> -static int madvise_inject_error(int behavior,
> -		unsigned long start, unsigned long end)
> +static int madvise_inject_error(unsigned long start, unsigned long end,
> +		struct madvise_behavior *madv_behavior)
>  {
>  	unsigned long size;
>
>  	if (!capable(CAP_SYS_ADMIN))
>  		return -EPERM;
>
> -
>  	for (; start < end; start += size) {
>  		unsigned long pfn;
>  		struct page *page;
> @@ -1448,7 +1450,7 @@ static int madvise_inject_error(int behavior,
>  		 */
>  		size = page_size(compound_head(page));
>
> -		if (behavior == MADV_SOFT_OFFLINE) {
> +		if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
>  			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
>  				 pfn, start);
>  			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
> @@ -1467,9 +1469,9 @@ static int madvise_inject_error(int behavior,
>  	return 0;
>  }
>

Is this necessary? madvise_inject_error() only cares about behavior.

> -static bool is_memory_failure(int behavior)
> +static bool is_memory_failure(struct madvise_behavior *madv_behavior)
>  {
> -	switch (behavior) {
> +	switch (madv_behavior->behavior) {
>  	case MADV_HWPOISON:
>  	case MADV_SOFT_OFFLINE:
>  		return true;
> @@ -1480,13 +1482,13 @@ static bool is_memory_failure(int behavior)
>
>  #else
>
> -static int madvise_inject_error(int behavior,
> -		unsigned long start, unsigned long end)
> +static int madvise_inject_error(unsigned long start, unsigned long end,
> +		struct madvise_behavior *madv_behavior)
>  {
>  	return 0;
>  }
>
> -static bool is_memory_failure(int behavior)
> +static bool is_memory_failure(struct madvise_behavior *madv_behavior)
>  {
>  	return false;
>  }

Same here. Your is_anon_vma_name() still takes int behavior, why
would is_memory_failure() take struct madvise_behavior?

<snip>

> -static bool is_madvise_populate(int behavior)
> +static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
>  {
> -	switch (behavior) {
> +	switch (madv_behavior->behavior) {
>  	case MADV_POPULATE_READ:
>  	case MADV_POPULATE_WRITE:
>  		return true;

Ditto.

The rest looks good to me.

--
Best Regards,
Yan, Zi
Re: [PATCH 2/5] mm/madvise: thread mm_struct through madvise_behavior
Posted by Lorenzo Stoakes 3 months, 2 weeks ago
On Thu, Jun 19, 2025 at 09:40:34PM -0400, Zi Yan wrote:
> On 19 Jun 2025, at 16:26, Lorenzo Stoakes wrote:
>
> > There's no need to thread a pointer to the mm_struct nor have different
> > functions signatures for each behaviour, instead store state in the struct
> > madvise_behavior object consistently and use it for all madvise() actions.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >  mm/madvise.c | 105 ++++++++++++++++++++++++++-------------------------
> >  1 file changed, 54 insertions(+), 51 deletions(-)
> >
>
> <snip>
>
> > @@ -1422,15 +1425,14 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
> >  /*
> >   * Error injection support for memory error handling.
> >   */
> > -static int madvise_inject_error(int behavior,
> > -		unsigned long start, unsigned long end)
> > +static int madvise_inject_error(unsigned long start, unsigned long end,
> > +		struct madvise_behavior *madv_behavior)
> >  {
> >  	unsigned long size;
> >
> >  	if (!capable(CAP_SYS_ADMIN))
> >  		return -EPERM;
> >
> > -
> >  	for (; start < end; start += size) {
> >  		unsigned long pfn;
> >  		struct page *page;
> > @@ -1448,7 +1450,7 @@ static int madvise_inject_error(int behavior,
> >  		 */
> >  		size = page_size(compound_head(page));
> >
> > -		if (behavior == MADV_SOFT_OFFLINE) {
> > +		if (madv_behavior->behavior == MADV_SOFT_OFFLINE) {
> >  			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
> >  				 pfn, start);
> >  			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
> > @@ -1467,9 +1469,9 @@ static int madvise_inject_error(int behavior,
> >  	return 0;
> >  }
> >
>
> Is this necessary? madvise_inject_error() only cares about behavior.

As you notice in a subsequent patch, we go further. It's also useful
signature-wise for all functions to have the exact same signature for
consistency, something sorely lacking in madvise.c for some time.

This is covered under the 'nor have different functions signatures for each
behaviour' bit in the commit message :)

>
> > -static bool is_memory_failure(int behavior)
> > +static bool is_memory_failure(struct madvise_behavior *madv_behavior)
> >  {
> > -	switch (behavior) {
> > +	switch (madv_behavior->behavior) {
> >  	case MADV_HWPOISON:
> >  	case MADV_SOFT_OFFLINE:
> >  		return true;
> > @@ -1480,13 +1482,13 @@ static bool is_memory_failure(int behavior)
> >
> >  #else
> >
> > -static int madvise_inject_error(int behavior,
> > -		unsigned long start, unsigned long end)
> > +static int madvise_inject_error(unsigned long start, unsigned long end,
> > +		struct madvise_behavior *madv_behavior)
> >  {
> >  	return 0;
> >  }
> >
> > -static bool is_memory_failure(int behavior)
> > +static bool is_memory_failure(struct madvise_behavior *madv_behavior)
> >  {
> >  	return false;
> >  }
>
> Same here. Your is_anon_vma_name() still takes int behavior, why
> would is_memory_failure() take struct madvise_behavior?

See above.

>
> <snip>
>
> > -static bool is_madvise_populate(int behavior)
> > +static bool is_madvise_populate(struct madvise_behavior *madv_behavior)
> >  {
> > -	switch (behavior) {
> > +	switch (madv_behavior->behavior) {
> >  	case MADV_POPULATE_READ:
> >  	case MADV_POPULATE_WRITE:
> >  		return true;
>
> Ditto.

Doing it this way makes life easier (esp later) and is more consistent with
other invocations.

>
> The rest looks good to me.

Thanks!

>
> --
> Best Regards,
> Yan, Zi