[PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c

Lorenzo Stoakes posted 5 patches 2 months, 3 weeks ago
There is a newer version of this series
[PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
The madvise() logic is inexplicably performed in mm/mseal.c - this ought to
be located in mm/madvise.c.

Additionally can_modify_vma_madv() is inconsistently named and, in
combination with is_ro_anon(), is very confusing logic.

Put a static function in mm/madvise.c instead - can_madvise_modify() - that
spells out exactly what's happening. Also explicitly check for an anon VMA.

Additionally add commentary to explain what's going on.

No functional change intended.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/mseal.c   | 49 -----------------------------------------
 mm/vma.h     |  7 ------
 3 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 9de9b7c797c6..75757ba418a8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
 #include <linux/string.h>
 #include <linux/uio.h>
 #include <linux/ksm.h>
@@ -1256,6 +1257,65 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
 			       &guard_remove_walk_ops, NULL);
 }

+#ifdef CONFIG_64BIT
+/* Does the madvise operation result in discarding of mapped data? */
+static bool is_discard(int behavior)
+{
+	switch (behavior) {
+	case MADV_FREE:
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+	case MADV_REMOVE:
+	case MADV_DONTFORK:
+	case MADV_WIPEONFORK:
+	case MADV_GUARD_INSTALL:
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
+ * circumstances - discarding of data from read-only anonymous SEALED mappings.
+ *
+ * This is because users cannot trivally discard data from these VMAs, and may
+ * only do so via an appropriate madvise() call.
+ */
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+	struct vm_area_struct *vma = madv_behavior->vma;
+
+	/* If the operation won't discard, we're good. */
+	if (!is_discard(madv_behavior->behavior))
+		return true;
+
+	/* If the VMA isn't sealed we're also good. */
+	if (can_modify_vma(vma))
+		return true;
+
+	/* File-backed mappings don't lose data on discard. */
+	if (!vma_is_anonymous(vma))
+		return true;
+
+	/*
+	 * If the VMA is writable and the architecture permits write access,
+	 * then discarding is fine.
+	 */
+	if ((vma->vm_flags & VM_WRITE) &&
+	    arch_vma_access_permitted(vma, /* write= */ true,
+			/* execute= */ false, /* foreign= */ false))
+		return true;
+
+	return false;
+}
+#else
+static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
+{
+	return true;
+}
+#endif
+
 /*
  * Apply an madvise behavior to a region of a vma.  madvise_update_vma
  * will handle splitting a vm area into separate areas, each area with its own
@@ -1269,7 +1329,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
 	struct madvise_behavior_range *range = &madv_behavior->range;
 	int error;

-	if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
+	if (unlikely(!can_madvise_modify(madv_behavior)))
 		return -EPERM;

 	switch (behavior) {
diff --git a/mm/mseal.c b/mm/mseal.c
index c27197ac04e8..1308e88ab184 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -11,7 +11,6 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
-#include <linux/mmu_context.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include "internal.h"
@@ -21,54 +20,6 @@ static inline void set_vma_sealed(struct vm_area_struct *vma)
 	vm_flags_set(vma, VM_SEALED);
 }

-static bool is_madv_discard(int behavior)
-{
-	switch (behavior) {
-	case MADV_FREE:
-	case MADV_DONTNEED:
-	case MADV_DONTNEED_LOCKED:
-	case MADV_REMOVE:
-	case MADV_DONTFORK:
-	case MADV_WIPEONFORK:
-	case MADV_GUARD_INSTALL:
-		return true;
-	}
-
-	return false;
-}
-
-static bool is_ro_anon(struct vm_area_struct *vma)
-{
-	/* check anonymous mapping. */
-	if (vma->vm_file || vma->vm_flags & VM_SHARED)
-		return false;
-
-	/*
-	 * check for non-writable:
-	 * PROT=RO or PKRU is not writeable.
-	 */
-	if (!(vma->vm_flags & VM_WRITE) ||
-		!arch_vma_access_permitted(vma, true, false, false))
-		return true;
-
-	return false;
-}
-
-/*
- * Check if a vma is allowed to be modified by madvise.
- */
-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
-	if (!is_madv_discard(behavior))
-		return true;
-
-	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
-		return false;
-
-	/* Allow by default. */
-	return true;
-}
-
 static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		struct vm_area_struct **prev, unsigned long start,
 		unsigned long end, vm_flags_t newflags)
diff --git a/mm/vma.h b/mm/vma.h
index cf6e3a6371b6..6515045ba342 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -578,8 +578,6 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
 	return true;
 }

-bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
-
 #else

 static inline bool can_modify_vma(struct vm_area_struct *vma)
@@ -587,11 +585,6 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
 	return true;
 }

-static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
-{
-	return true;
-}
-
 #endif

 #if defined(CONFIG_STACK_GROWSUP)
--
2.50.1
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Liam R. Howlett 2 months, 3 weeks ago
* Lorenzo Stoakes <lorenzo.stoakes@oracle.com> [250714 09:08]:
> The madvise() logic is inexplicably performed in mm/mseal.c - this ought to
> be located in mm/madvise.c.
> 
> Additionally can_modify_vma_madv() is inconsistently named and, in
> combination with is_ro_anon(), is very confusing logic.
> 
> Put a static function in mm/madvise.c instead - can_madvise_modify() - that
> spells out exactly what's happening. Also explicitly check for an anon VMA.
> 
> Additionally add commentary to explain what's going on.
> 
> No functional change intended.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>

> ---
>  mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  mm/mseal.c   | 49 -----------------------------------------
>  mm/vma.h     |  7 ------
>  3 files changed, 61 insertions(+), 57 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 9de9b7c797c6..75757ba418a8 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -19,6 +19,7 @@
>  #include <linux/sched.h>
>  #include <linux/sched/mm.h>
>  #include <linux/mm_inline.h>
> +#include <linux/mmu_context.h>
>  #include <linux/string.h>
>  #include <linux/uio.h>
>  #include <linux/ksm.h>
> @@ -1256,6 +1257,65 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
>  			       &guard_remove_walk_ops, NULL);
>  }
> 
> +#ifdef CONFIG_64BIT
> +/* Does the madvise operation result in discarding of mapped data? */
> +static bool is_discard(int behavior)
> +{
> +	switch (behavior) {
> +	case MADV_FREE:
> +	case MADV_DONTNEED:
> +	case MADV_DONTNEED_LOCKED:
> +	case MADV_REMOVE:
> +	case MADV_DONTFORK:
> +	case MADV_WIPEONFORK:
> +	case MADV_GUARD_INSTALL:
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
> + * circumstances - discarding of data from read-only anonymous SEALED mappings.
> + *
> + * This is because users cannot trivally discard data from these VMAs, and may
> + * only do so via an appropriate madvise() call.
> + */
> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> +{
> +	struct vm_area_struct *vma = madv_behavior->vma;
> +
> +	/* If the operation won't discard, we're good. */
> +	if (!is_discard(madv_behavior->behavior))
> +		return true;
> +
> +	/* If the VMA isn't sealed we're also good. */
> +	if (can_modify_vma(vma))
> +		return true;
> +
> +	/* File-backed mappings don't lose data on discard. */
> +	if (!vma_is_anonymous(vma))
> +		return true;
> +
> +	/*
> +	 * If the VMA is writable and the architecture permits write access,
> +	 * then discarding is fine.
> +	 */
> +	if ((vma->vm_flags & VM_WRITE) &&
> +	    arch_vma_access_permitted(vma, /* write= */ true,
> +			/* execute= */ false, /* foreign= */ false))
> +		return true;
> +
> +	return false;
> +}
> +#else
> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> +{
> +	return true;
> +}
> +#endif
> +
>  /*
>   * Apply an madvise behavior to a region of a vma.  madvise_update_vma
>   * will handle splitting a vm area into separate areas, each area with its own
> @@ -1269,7 +1329,7 @@ static int madvise_vma_behavior(struct madvise_behavior *madv_behavior)
>  	struct madvise_behavior_range *range = &madv_behavior->range;
>  	int error;
> 
> -	if (unlikely(!can_modify_vma_madv(madv_behavior->vma, behavior)))
> +	if (unlikely(!can_madvise_modify(madv_behavior)))
>  		return -EPERM;
> 
>  	switch (behavior) {
> diff --git a/mm/mseal.c b/mm/mseal.c
> index c27197ac04e8..1308e88ab184 100644
> --- a/mm/mseal.c
> +++ b/mm/mseal.c
> @@ -11,7 +11,6 @@
>  #include <linux/mman.h>
>  #include <linux/mm.h>
>  #include <linux/mm_inline.h>
> -#include <linux/mmu_context.h>
>  #include <linux/syscalls.h>
>  #include <linux/sched.h>
>  #include "internal.h"
> @@ -21,54 +20,6 @@ static inline void set_vma_sealed(struct vm_area_struct *vma)
>  	vm_flags_set(vma, VM_SEALED);
>  }
> 
> -static bool is_madv_discard(int behavior)
> -{
> -	switch (behavior) {
> -	case MADV_FREE:
> -	case MADV_DONTNEED:
> -	case MADV_DONTNEED_LOCKED:
> -	case MADV_REMOVE:
> -	case MADV_DONTFORK:
> -	case MADV_WIPEONFORK:
> -	case MADV_GUARD_INSTALL:
> -		return true;
> -	}
> -
> -	return false;
> -}
> -
> -static bool is_ro_anon(struct vm_area_struct *vma)
> -{
> -	/* check anonymous mapping. */
> -	if (vma->vm_file || vma->vm_flags & VM_SHARED)
> -		return false;
> -
> -	/*
> -	 * check for non-writable:
> -	 * PROT=RO or PKRU is not writeable.
> -	 */
> -	if (!(vma->vm_flags & VM_WRITE) ||
> -		!arch_vma_access_permitted(vma, true, false, false))
> -		return true;
> -
> -	return false;
> -}
> -
> -/*
> - * Check if a vma is allowed to be modified by madvise.
> - */
> -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
> -{
> -	if (!is_madv_discard(behavior))
> -		return true;
> -
> -	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
> -		return false;
> -
> -	/* Allow by default. */
> -	return true;
> -}
> -
>  static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  		struct vm_area_struct **prev, unsigned long start,
>  		unsigned long end, vm_flags_t newflags)
> diff --git a/mm/vma.h b/mm/vma.h
> index cf6e3a6371b6..6515045ba342 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -578,8 +578,6 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
>  	return true;
>  }
> 
> -bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
> -
>  #else
> 
>  static inline bool can_modify_vma(struct vm_area_struct *vma)
> @@ -587,11 +585,6 @@ static inline bool can_modify_vma(struct vm_area_struct *vma)
>  	return true;
>  }
> 
> -static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
> -{
> -	return true;
> -}
> -
>  #endif
> 
>  #if defined(CONFIG_STACK_GROWSUP)
> --
> 2.50.1
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Pedro Falcato 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 02:00:37PM +0100, Lorenzo Stoakes wrote:
> The madvise() logic is inexplicably performed in mm/mseal.c - this ought to
> be located in mm/madvise.c.
> 
> Additionally can_modify_vma_madv() is inconsistently named and, in
> combination with is_ro_anon(), is very confusing logic.
> 
> Put a static function in mm/madvise.c instead - can_madvise_modify() - that
> spells out exactly what's happening. Also explicitly check for an anon VMA.
> 
> Additionally add commentary to explain what's going on.
> 
> No functional change intended.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Reviewed-by: Pedro Falcato <pfalcato@suse.de>

-- 
Pedro
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by David Hildenbrand 2 months, 3 weeks ago
On 14.07.25 15:00, Lorenzo Stoakes wrote:
> The madvise() logic is inexplicably performed in mm/mseal.c - this ought to
> be located in mm/madvise.c.
> 
> Additionally can_modify_vma_madv() is inconsistently named and, in
> combination with is_ro_anon(), is very confusing logic.
> 
> Put a static function in mm/madvise.c instead - can_madvise_modify() - that
> spells out exactly what's happening. Also explicitly check for an anon VMA.
> 
> Additionally add commentary to explain what's going on.
> 
> No functional change intended.
> 
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>   mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   mm/mseal.c   | 49 -----------------------------------------
>   mm/vma.h     |  7 ------
>   3 files changed, 61 insertions(+), 57 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index 9de9b7c797c6..75757ba418a8 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -19,6 +19,7 @@
>   #include <linux/sched.h>
>   #include <linux/sched/mm.h>
>   #include <linux/mm_inline.h>
> +#include <linux/mmu_context.h>
>   #include <linux/string.h>
>   #include <linux/uio.h>
>   #include <linux/ksm.h>
> @@ -1256,6 +1257,65 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
>   			       &guard_remove_walk_ops, NULL);
>   }
> 
> +#ifdef CONFIG_64BIT

It's consistent with mm/Makefile etc. but having a simple

config MSEAL
	def_bool y if 64BIT

or sth like that would surely clean that up further.

> +/* Does the madvise operation result in discarding of mapped data? */
> +static bool is_discard(int behavior)
> +{
> +	switch (behavior) {
> +	case MADV_FREE:
> +	case MADV_DONTNEED:
> +	case MADV_DONTNEED_LOCKED:
> +	case MADV_REMOVE:
> +	case MADV_DONTFORK:
> +	case MADV_WIPEONFORK:
> +	case MADV_GUARD_INSTALL:
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
> + * circumstances - discarding of data from read-only anonymous SEALED mappings.
> + *
> + * This is because users cannot trivally discard data from these VMAs, and may

s/trivally/trivially/

> + * only do so via an appropriate madvise() call.
> + */
> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> +{
> +	struct vm_area_struct *vma = madv_behavior->vma;
> +
> +	/* If the operation won't discard, we're good. */
> +	if (!is_discard(madv_behavior->behavior))
> +		return true;


Conceptually, I would do this first and then handle all the discard 
cases / exceptions.

> +
> +	/* If the VMA isn't sealed we're also good. */
> +	if (can_modify_vma(vma))
> +		return true;
 > +> +	/* File-backed mappings don't lose data on discard. */
> +	if (!vma_is_anonymous(vma))
> +		return true;
> +
> +	/*
> +	 * If the VMA is writable and the architecture permits write access,
> +	 * then discarding is fine.
> +	 */
> +	if ((vma->vm_flags & VM_WRITE) &&
> +	    arch_vma_access_permitted(vma, /* write= */ true,
> +			/* execute= */ false, /* foreign= */ false))
> +		return true;
> +
> +	return false;
> +}
> +#else
> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> +{
> +	return true;
> +}
> +#endif
> +
>   /*

LGTM

-- 
Cheers,

David / dhildenb
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 04:37:30PM +0200, David Hildenbrand wrote:
> On 14.07.25 15:00, Lorenzo Stoakes wrote:
> > The madvise() logic is inexplicably performed in mm/mseal.c - this ought to
> > be located in mm/madvise.c.
> >
> > Additionally can_modify_vma_madv() is inconsistently named and, in
> > combination with is_ro_anon(), is very confusing logic.
> >
> > Put a static function in mm/madvise.c instead - can_madvise_modify() - that
> > spells out exactly what's happening. Also explicitly check for an anon VMA.
> >
> > Additionally add commentary to explain what's going on.
> >
> > No functional change intended.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> >   mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> >   mm/mseal.c   | 49 -----------------------------------------
> >   mm/vma.h     |  7 ------
> >   3 files changed, 61 insertions(+), 57 deletions(-)
> >
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index 9de9b7c797c6..75757ba418a8 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -19,6 +19,7 @@
> >   #include <linux/sched.h>
> >   #include <linux/sched/mm.h>
> >   #include <linux/mm_inline.h>
> > +#include <linux/mmu_context.h>
> >   #include <linux/string.h>
> >   #include <linux/uio.h>
> >   #include <linux/ksm.h>
> > @@ -1256,6 +1257,65 @@ static long madvise_guard_remove(struct madvise_behavior *madv_behavior)
> >   			       &guard_remove_walk_ops, NULL);
> >   }
> >
> > +#ifdef CONFIG_64BIT
>
> It's consistent with mm/Makefile etc. but having a simple
>
> config MSEAL
> 	def_bool y if 64BIT
>
> or sth like that would surely clean that up further.

Well, I plan to make this not a thing soon so I'd rather not.

The intent is to make _all_ VMA flags work on 32-bit kernels. I have done some
preparatory work and next cycle intend to do more on this.

So I'd rather avoid any config changes on this until I've given this a shot.

>
> > +/* Does the madvise operation result in discarding of mapped data? */
> > +static bool is_discard(int behavior)
> > +{
> > +	switch (behavior) {
> > +	case MADV_FREE:
> > +	case MADV_DONTNEED:
> > +	case MADV_DONTNEED_LOCKED:
> > +	case MADV_REMOVE:
> > +	case MADV_DONTFORK:
> > +	case MADV_WIPEONFORK:
> > +	case MADV_GUARD_INSTALL:
> > +		return true;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> > +/*
> > + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
> > + * circumstances - discarding of data from read-only anonymous SEALED mappings.
> > + *
> > + * This is because users cannot trivally discard data from these VMAs, and may
>
> s/trivally/trivially/

Ack thanks - Andrew can you fixup? Can send a fix-patch otherwise.

>
> > + * only do so via an appropriate madvise() call.
> > + */
> > +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> > +{
> > +	struct vm_area_struct *vma = madv_behavior->vma;
> > +
> > +	/* If the operation won't discard, we're good. */
> > +	if (!is_discard(madv_behavior->behavior))
> > +		return true;
>
>
> Conceptually, I would do this first and then handle all the discard cases /
> exceptions.

Hm I'm confused :P we do do this first? I think the idea with this is we can
very cheaply ignore any MADV_ that isn't applicable.

Did you mean to put this comment under line below?

I mean it's not exactly a perf hotspot so don't mind moving them around.

>
> > +
> > +	/* If the VMA isn't sealed we're also good. */
> > +	if (can_modify_vma(vma))
> > +		return true;
> > +> +	/* File-backed mappings don't lose data on discard. */
> > +	if (!vma_is_anonymous(vma))
> > +		return true;
> > +
> > +	/*
> > +	 * If the VMA is writable and the architecture permits write access,
> > +	 * then discarding is fine.
> > +	 */
> > +	if ((vma->vm_flags & VM_WRITE) &&
> > +	    arch_vma_access_permitted(vma, /* write= */ true,
> > +			/* execute= */ false, /* foreign= */ false))
> > +		return true;
> > +
> > +	return false;
> > +}
> > +#else
> > +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> > +{
> > +	return true;
> > +}
> > +#endif
> > +
> >   /*
>
> LGTM

Cheers!

>
> --
> Cheers,
>
> David / dhildenb
>
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by David Hildenbrand 2 months, 3 weeks ago
>> or sth like that would surely clean that up further.
> 
> Well, I plan to make this not a thing soon so I'd rather not.
> 
> The intent is to make _all_ VMA flags work on 32-bit kernels. I have done some
> preparatory work and next cycle intend to do more on this.
> 
> So I'd rather avoid any config changes on this until I've given this a shot.

Sure, if that is in sight.

>>
>>> +/* Does the madvise operation result in discarding of mapped data? */
>>> +static bool is_discard(int behavior)
>>> +{
>>> +	switch (behavior) {
>>> +	case MADV_FREE:
>>> +	case MADV_DONTNEED:
>>> +	case MADV_DONTNEED_LOCKED:
>>> +	case MADV_REMOVE:
>>> +	case MADV_DONTFORK:
>>> +	case MADV_WIPEONFORK:
>>> +	case MADV_GUARD_INSTALL:
>>> +		return true;
>>> +	}
>>> +
>>> +	return false;
>>> +}
>>> +
>>> +/*
>>> + * We are restricted from madvise()'ing mseal()'d VMAs only in very particular
>>> + * circumstances - discarding of data from read-only anonymous SEALED mappings.
>>> + *
>>> + * This is because users cannot trivally discard data from these VMAs, and may
>>
>> s/trivally/trivially/
> 
> Ack thanks - Andrew can you fixup? Can send a fix-patch otherwise.
> 
>>
>>> + * only do so via an appropriate madvise() call.
>>> + */
>>> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
>>> +{
>>> +	struct vm_area_struct *vma = madv_behavior->vma;
>>> +
>>> +	/* If the operation won't discard, we're good. */
>>> +	if (!is_discard(madv_behavior->behavior))
>>> +		return true;
>>
>>
>> Conceptually, I would do this first and then handle all the discard cases /
>> exceptions.
> 
> Hm I'm confused :P we do do this first? I think the idea with this is we can
> very cheaply ignore any MADV_ that isn't applicable.
> 
> Did you mean to put this comment under line below?
> 
> I mean it's not exactly a perf hotspot so don't mind moving them around.

I was thinking of this (start with sealed, then go into details about 
discards):

/* If the VMA isn't sealed, we're all good. */
if (can_modify_vma(vma))
	return true;

/* In a sealed VMA, we only care about discard operations. */
if (!is_discard(madv_behavior->behavior))
	return true;

/* But discards of file-backed mappings are fine. */
if (!vma_is_anonymous(vma))
	return true;

...


But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE 
file mapping?

-- 
Cheers,

David / dhildenb
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Pedro Falcato 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
> [...] 
> 
> But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
> mapping?

IIRC this was originally suggested by Linus, on one of the versions introducing
mseal. But the gist is that discarding pages is okay if you could already zero
them manually, using e.g memset. Hence the writeability checks.

-- 
Pedro
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by David Hildenbrand 2 months, 3 weeks ago
On 14.07.25 17:31, Pedro Falcato wrote:
> On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
>> [...]
>>
>> But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
>> mapping?
> 
> IIRC this was originally suggested by Linus, on one of the versions introducing
> mseal. But the gist is that discarding pages is okay if you could already zero
> them manually, using e.g memset. Hence the writeability checks.

What you can do is

a) mmap(PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, ...)

b) modify content (write, whatever)

c) mprotect(PROT_READ)

d) mseal()

But then still do

madvise(MADV_DONTNEED)

to discard.


There is no writability anymore.

(Just a note that, with hugetlb, it is fairly common to 
mmap(MAP_PRIVATE) empty files and only work with anonymous pages.)

-- 
Cheers,

David / dhildenb
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 05:41:45PM +0200, David Hildenbrand wrote:
> On 14.07.25 17:31, Pedro Falcato wrote:
> > On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
> > > [...]
> > >
> > > But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
> > > mapping?
> >
> > IIRC this was originally suggested by Linus, on one of the versions introducing
> > mseal. But the gist is that discarding pages is okay if you could already zero
> > them manually, using e.g memset. Hence the writeability checks.
>
> What you can do is
>
> a) mmap(PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, ...)
>
> b) modify content (write, whatever)
>
> c) mprotect(PROT_READ)
>
> d) mseal()
>
> But then still do
>
> madvise(MADV_DONTNEED)
>
> to discard.
>
>
> There is no writability anymore.

Well, you can mprotect() writable it again :)

>
> (Just a note that, with hugetlb, it is fairly common to mmap(MAP_PRIVATE)
> empty files and only work with anonymous pages.)

This just makes me think that we should be checking VM_MAYWRITE anyway which
squares this circle.

Otherwise you can do 'workarounds' with mprotect() generally.

It's really only meaningful for a MAP_PRIVATE file-backed mapping of a read-only
file.

>
> --
> Cheers,
>
> David / dhildenb
>
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by David Hildenbrand 2 months, 3 weeks ago
On 14.07.25 17:45, Lorenzo Stoakes wrote:
> On Mon, Jul 14, 2025 at 05:41:45PM +0200, David Hildenbrand wrote:
>> On 14.07.25 17:31, Pedro Falcato wrote:
>>> On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
>>>> [...]
>>>>
>>>> But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
>>>> mapping?
>>>
>>> IIRC this was originally suggested by Linus, on one of the versions introducing
>>> mseal. But the gist is that discarding pages is okay if you could already zero
>>> them manually, using e.g memset. Hence the writeability checks.
>>
>> What you can do is
>>
>> a) mmap(PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, ...)
>>
>> b) modify content (write, whatever)
>>
>> c) mprotect(PROT_READ)
>>
>> d) mseal()
>>
>> But then still do
>>
>> madvise(MADV_DONTNEED)
>>
>> to discard.
>>
>>
>> There is no writability anymore.
> 
> Well, you can mprotect() writable it again :)

Isn't that what sealing ... prohibits?

-- 
Cheers,

David / dhildenb
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 05:52:56PM +0200, David Hildenbrand wrote:
> On 14.07.25 17:45, Lorenzo Stoakes wrote:
> > On Mon, Jul 14, 2025 at 05:41:45PM +0200, David Hildenbrand wrote:
> > > On 14.07.25 17:31, Pedro Falcato wrote:
> > > > On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
> > > > > [...]
> > > > >
> > > > > But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
> > > > > mapping?
> > > >
> > > > IIRC this was originally suggested by Linus, on one of the versions introducing
> > > > mseal. But the gist is that discarding pages is okay if you could already zero
> > > > them manually, using e.g memset. Hence the writeability checks.
> > >
> > > What you can do is
> > >
> > > a) mmap(PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, ...)
> > >
> > > b) modify content (write, whatever)
> > >
> > > c) mprotect(PROT_READ)
> > >
> > > d) mseal()
> > >
> > > But then still do
> > >
> > > madvise(MADV_DONTNEED)
> > >
> > > to discard.
> > >
> > >
> > > There is no writability anymore.
> >
> > Well, you can mprotect() writable it again :)
>
> Isn't that what sealing ... prohibits?

Doh! Haha. Yes :) very good point...

OK I'll send the fix once this refactoring lands.

I think the check should simply be replaced with a vma->vm_flags & VM_SHARED
check then?

I wonder if the argument was that the MAP_PRIVATE file mapping is somehow
ephemeral anyway so not so problematic to lose. But I'm sure there are users who
feel differently about that...

Note that a truncate operation on the file will immediately drop the MAP_PRIVATE
anon backing anyway so it's sort of 'droppable' the whole time.

However, since a user is explicitly asking to seal the memory, and would have
expectation of seal semantics, so I agree that we need to fix this.

>
> --
> Cheers,
>
> David / dhildenb
>
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 04:31:59PM +0100, Pedro Falcato wrote:
> On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
> > [...]
> >
> > But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
> > mapping?
>
> IIRC this was originally suggested by Linus, on one of the versions introducing
> mseal. But the gist is that discarding pages is okay if you could already zero
> them manually, using e.g memset. Hence the writeability checks.

Right, and if it's read-only and MAP_PRIVATE it doesn't really matter. In which
case we needn't worry.

I wonder if we need to check for VM_MAYWRITE though...

>
> --
> Pedro
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
> > > or sth like that would surely clean that up further.
> >
> > Well, I plan to make this not a thing soon so I'd rather not.
> >
> > The intent is to make _all_ VMA flags work on 32-bit kernels. I have done some
> > preparatory work and next cycle intend to do more on this.
> >
> > So I'd rather avoid any config changes on this until I've given this a shot.
>
> Sure, if that is in sight.

Yes :)

> > > > + * only do so via an appropriate madvise() call.
> > > > + */
> > > > +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> > > > +{
> > > > +	struct vm_area_struct *vma = madv_behavior->vma;
> > > > +
> > > > +	/* If the operation won't discard, we're good. */
> > > > +	if (!is_discard(madv_behavior->behavior))
> > > > +		return true;
> > >
> > >
> > > Conceptually, I would do this first and then handle all the discard cases /
> > > exceptions.
> >
> > Hm I'm confused :P we do do this first? I think the idea with this is we can
> > very cheaply ignore any MADV_ that isn't applicable.
> >
> > Did you mean to put this comment under line below?
> >
> > I mean it's not exactly a perf hotspot so don't mind moving them around.
>
> I was thinking of this (start with sealed, then go into details about
> discards):
>
> /* If the VMA isn't sealed, we're all good. */
> if (can_modify_vma(vma))
> 	return true;
>
> /* In a sealed VMA, we only care about discard operations. */
> if (!is_discard(madv_behavior->behavior))
> 	return true;
>
> /* But discards of file-backed mappings are fine. */
> if (!vma_is_anonymous(vma))
> 	return true;

Right yeah.

>
> ...
>
>
> But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
> mapping?

I'm duplicating existing logic here (well updating from the vma->vm_file check
and a seemingly pointless !vma->vm_file && vma->vm_flags & VM_SHARED check), but
this is a good point...

For the purposes of the refactoring I guess best to keep the logic ostensibly
the same given the 'no functional change intended', but we do need to fix this
yes.

That change would probably be better as a follow-up with a test change added
too.

But I agree this is an oversight here afaict.

>
> --
> Cheers,
>
> David / dhildenb
>
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by David Hildenbrand 2 months, 3 weeks ago
On 14.07.25 17:18, Lorenzo Stoakes wrote:
> On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
>>>> or sth like that would surely clean that up further.
>>>
>>> Well, I plan to make this not a thing soon so I'd rather not.
>>>
>>> The intent is to make _all_ VMA flags work on 32-bit kernels. I have done some
>>> preparatory work and next cycle intend to do more on this.
>>>
>>> So I'd rather avoid any config changes on this until I've given this a shot.
>>
>> Sure, if that is in sight.
> 
> Yes :)
> 
>>>>> + * only do so via an appropriate madvise() call.
>>>>> + */
>>>>> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
>>>>> +{
>>>>> +	struct vm_area_struct *vma = madv_behavior->vma;
>>>>> +
>>>>> +	/* If the operation won't discard, we're good. */
>>>>> +	if (!is_discard(madv_behavior->behavior))
>>>>> +		return true;
>>>>
>>>>
>>>> Conceptually, I would do this first and then handle all the discard cases /
>>>> exceptions.
>>>
>>> Hm I'm confused :P we do do this first? I think the idea with this is we can
>>> very cheaply ignore any MADV_ that isn't applicable.
>>>
>>> Did you mean to put this comment under line below?
>>>
>>> I mean it's not exactly a perf hotspot so don't mind moving them around.
>>
>> I was thinking of this (start with sealed, then go into details about
>> discards):
>>
>> /* If the VMA isn't sealed, we're all good. */
>> if (can_modify_vma(vma))
>> 	return true;
>>
>> /* In a sealed VMA, we only care about discard operations. */
>> if (!is_discard(madv_behavior->behavior))
>> 	return true;
>>
>> /* But discards of file-backed mappings are fine. */
>> if (!vma_is_anonymous(vma))
>> 	return true;
> 
> Right yeah.
> 
>>
>> ...
>>
>>
>> But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
>> mapping?
> 
> I'm duplicating existing logic here (well updating from the vma->vm_file check
> and a seemingly pointless !vma->vm_file && vma->vm_flags & VM_SHARED check), but
> this is a good point...

Yeah, not blaming you, just scratching my head :)

> 
> For the purposes of the refactoring I guess best to keep the logic ostensibly
> the same given the 'no functional change intended', but we do need to fix this
> yes.


Likely a fix should be pulled in early? Not sure ... but it sure sounds 
broken.

-- 
Cheers,

David / dhildenb
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by Lorenzo Stoakes 2 months, 3 weeks ago
On Mon, Jul 14, 2025 at 05:24:14PM +0200, David Hildenbrand wrote:
> On 14.07.25 17:18, Lorenzo Stoakes wrote:
> > On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
> > > > > or sth like that would surely clean that up further.
> > > >
> > > > Well, I plan to make this not a thing soon so I'd rather not.
> > > >
> > > > The intent is to make _all_ VMA flags work on 32-bit kernels. I have done some
> > > > preparatory work and next cycle intend to do more on this.
> > > >
> > > > So I'd rather avoid any config changes on this until I've given this a shot.
> > >
> > > Sure, if that is in sight.
> >
> > Yes :)
> >
> > > > > > + * only do so via an appropriate madvise() call.
> > > > > > + */
> > > > > > +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
> > > > > > +{
> > > > > > +	struct vm_area_struct *vma = madv_behavior->vma;
> > > > > > +
> > > > > > +	/* If the operation won't discard, we're good. */
> > > > > > +	if (!is_discard(madv_behavior->behavior))
> > > > > > +		return true;
> > > > >
> > > > >
> > > > > Conceptually, I would do this first and then handle all the discard cases /
> > > > > exceptions.
> > > >
> > > > Hm I'm confused :P we do do this first? I think the idea with this is we can
> > > > very cheaply ignore any MADV_ that isn't applicable.
> > > >
> > > > Did you mean to put this comment under line below?
> > > >
> > > > I mean it's not exactly a perf hotspot so don't mind moving them around.
> > >
> > > I was thinking of this (start with sealed, then go into details about
> > > discards):
> > >
> > > /* If the VMA isn't sealed, we're all good. */
> > > if (can_modify_vma(vma))
> > > 	return true;
> > >
> > > /* In a sealed VMA, we only care about discard operations. */
> > > if (!is_discard(madv_behavior->behavior))
> > > 	return true;
> > >
> > > /* But discards of file-backed mappings are fine. */
> > > if (!vma_is_anonymous(vma))
> > > 	return true;
> >
> > Right yeah.
> >
> > >
> > > ...
> > >
> > >
> > > But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
> > > mapping?
> >
> > I'm duplicating existing logic here (well updating from the vma->vm_file check
> > and a seemingly pointless !vma->vm_file && vma->vm_flags & VM_SHARED check), but
> > this is a good point...
>
> Yeah, not blaming you, just scratching my head :)
>
> >
> > For the purposes of the refactoring I guess best to keep the logic ostensibly
> > the same given the 'no functional change intended', but we do need to fix this
> > yes.
>
>
> Likely a fix should be pulled in early? Not sure ... but it sure sounds
> broken.

Once this lands in mm-new I can send a fix (like litearlly tomorrow if it lands
:P). I just don't think a functional change belongs as part of a refactoring.

I worry that we'll get catch-22 caught on the (numerous) problems with the mseal
implementation and thus not provide a foundation upon which to fix them...

>
> --
> Cheers,
>
> David / dhildenb
>
Re: [PATCH 2/5] mm/mseal: move madvise() logic to mm/madvise.c
Posted by David Hildenbrand 2 months, 3 weeks ago
On 14.07.25 17:27, Lorenzo Stoakes wrote:
> On Mon, Jul 14, 2025 at 05:24:14PM +0200, David Hildenbrand wrote:
>> On 14.07.25 17:18, Lorenzo Stoakes wrote:
>>> On Mon, Jul 14, 2025 at 05:03:03PM +0200, David Hildenbrand wrote:
>>>>>> or sth like that would surely clean that up further.
>>>>>
>>>>> Well, I plan to make this not a thing soon so I'd rather not.
>>>>>
>>>>> The intent is to make _all_ VMA flags work on 32-bit kernels. I have done some
>>>>> preparatory work and next cycle intend to do more on this.
>>>>>
>>>>> So I'd rather avoid any config changes on this until I've given this a shot.
>>>>
>>>> Sure, if that is in sight.
>>>
>>> Yes :)
>>>
>>>>>>> + * only do so via an appropriate madvise() call.
>>>>>>> + */
>>>>>>> +static bool can_madvise_modify(struct madvise_behavior *madv_behavior)
>>>>>>> +{
>>>>>>> +	struct vm_area_struct *vma = madv_behavior->vma;
>>>>>>> +
>>>>>>> +	/* If the operation won't discard, we're good. */
>>>>>>> +	if (!is_discard(madv_behavior->behavior))
>>>>>>> +		return true;
>>>>>>
>>>>>>
>>>>>> Conceptually, I would do this first and then handle all the discard cases /
>>>>>> exceptions.
>>>>>
>>>>> Hm I'm confused :P we do do this first? I think the idea with this is we can
>>>>> very cheaply ignore any MADV_ that isn't applicable.
>>>>>
>>>>> Did you mean to put this comment under line below?
>>>>>
>>>>> I mean it's not exactly a perf hotspot so don't mind moving them around.
>>>>
>>>> I was thinking of this (start with sealed, then go into details about
>>>> discards):
>>>>
>>>> /* If the VMA isn't sealed, we're all good. */
>>>> if (can_modify_vma(vma))
>>>> 	return true;
>>>>
>>>> /* In a sealed VMA, we only care about discard operations. */
>>>> if (!is_discard(madv_behavior->behavior))
>>>> 	return true;
>>>>
>>>> /* But discards of file-backed mappings are fine. */
>>>> if (!vma_is_anonymous(vma))
>>>> 	return true;
>>>
>>> Right yeah.
>>>
>>>>
>>>> ...
>>>>
>>>>
>>>> But now I wonder, why is it okay to discard anon pages in a MAP_PRIVATE file
>>>> mapping?
>>>
>>> I'm duplicating existing logic here (well updating from the vma->vm_file check
>>> and a seemingly pointless !vma->vm_file && vma->vm_flags & VM_SHARED check), but
>>> this is a good point...
>>
>> Yeah, not blaming you, just scratching my head :)
>>
>>>
>>> For the purposes of the refactoring I guess best to keep the logic ostensibly
>>> the same given the 'no functional change intended', but we do need to fix this
>>> yes.
>>
>>
>> Likely a fix should be pulled in early? Not sure ... but it sure sounds
>> broken.
> 
> Once this lands in mm-new I can send a fix (like litearlly tomorrow if it lands
> :P). I just don't think a functional change belongs as part of a refactoring.
> 
> I worry that we'll get catch-22 caught on the (numerous) problems with the mseal
> implementation and thus not provide a foundation upon which to fix them...

Not sure what's right or wrong at this point. The cover letter only 
talked about anonymous memory:

"
Some destructive madvice() behaviors (e.g. MADV_DONTNEED) for anonymous
memory, when users don't have write permission to the memory. Those
behaviors can alter region contents by discarding pages, effectively
a memset(0) for anonymous memory.
"

We're similarly in the domain of altering memory content that was there 
(resetting it back to whatever the file provided).

It does like something that is not desired, but no idea how security 
relevant it would be. Probably a corner case we can indeed fixup separately.

-- 
Cheers,

David / dhildenb