[v1] mm/userfaultfd: modulize memory types

[PATCH 4/4] mm: Apply vm_uffd_ops API to core mm

Posted by Peter Xu 3 months, 2 weeks ago

This patch completely moves the old userfaultfd core to use the new
vm_uffd_ops API.  After this change, existing file systems will start to
use the new API for userfault operations.

When at it, moving vma_can_userfault() into mm/userfaultfd.c instead,
because it's getting too big.  It's only used in slow paths so it shouldn't
be an issue.

This will also remove quite some hard-coded checks for either shmem or
hugetlbfs.  Now all the old checks should still work but with vm_uffd_ops.

Note that anonymous memory will still need to be processed separately
because it doesn't have vm_ops at all.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 include/linux/shmem_fs.h      |  14 -----
 include/linux/userfaultfd_k.h |  46 ++++----------
 mm/shmem.c                    |   2 +-
 mm/userfaultfd.c              | 115 +++++++++++++++++++++++++---------
 4 files changed, 101 insertions(+), 76 deletions(-)

diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 6d0f9c599ff7..2f5b7b295cf6 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -195,20 +195,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
 extern bool shmem_charge(struct inode *inode, long pages);
 extern void shmem_uncharge(struct inode *inode, long pages);
 
-#ifdef CONFIG_USERFAULTFD
-#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
-				  struct vm_area_struct *dst_vma,
-				  unsigned long dst_addr,
-				  unsigned long src_addr,
-				  uffd_flags_t flags,
-				  struct folio **foliop);
-#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
-			       src_addr, flags, foliop) ({ BUG(); 0; })
-#endif /* CONFIG_SHMEM */
-#endif /* CONFIG_USERFAULTFD */
-
 /*
  * Used space is stored as unsigned 64-bit value in bytes but
  * quota core supports only signed 64-bit values so use that
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e79c724b3b95..4e56ad423a4a 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -85,9 +85,14 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
 #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
 
+static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
+{
+	return (enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
+}
+
 static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
 {
-	return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
+	return uffd_flags_get_mode(flags) == expected;
 }
 
 static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
@@ -196,41 +201,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     unsigned long vm_flags,
-				     bool wp_async)
+static inline const vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
 {
-	vm_flags &= __VM_UFFD_FLAGS;
-
-	if (vma->vm_flags & VM_DROPPABLE)
-		return false;
-
-	if ((vm_flags & VM_UFFD_MINOR) &&
-	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
-		return false;
-
-	/*
-	 * If wp async enabled, and WP is the only mode enabled, allow any
-	 * memory type.
-	 */
-	if (wp_async && (vm_flags == VM_UFFD_WP))
-		return true;
-
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
-	/*
-	 * If user requested uffd-wp but not enabled pte markers for
-	 * uffd-wp, then shmem & hugetlbfs are not supported but only
-	 * anonymous.
-	 */
-	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
-		return false;
-#endif
-
-	/* By default, allow any of anon|shmem|hugetlb */
-	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-	    vma_is_shmem(vma);
+	if (vma->vm_ops && vma->vm_ops->userfaultfd_ops)
+		return vma->vm_ops->userfaultfd_ops;
+	return NULL;
 }
 
+bool vma_can_userfault(struct vm_area_struct *vma,
+		       unsigned long vm_flags, bool wp_async);
+
 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
 {
 	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
diff --git a/mm/shmem.c b/mm/shmem.c
index bd0a29000318..4d71fc7be358 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3158,7 +3158,7 @@ static int shmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
 	return shmem_get_folio(inode, pgoff, 0, folio, SGP_NOALLOC);
 }
 
-int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
+static int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 			   struct vm_area_struct *dst_vma,
 			   unsigned long dst_addr,
 			   unsigned long src_addr,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 879505c6996f..61783ff2d335 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -14,12 +14,48 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
-#include <linux/shmem_fs.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include "internal.h"
 #include "swap.h"
 
+bool vma_can_userfault(struct vm_area_struct *vma,
+		       unsigned long vm_flags, bool wp_async)
+{
+	unsigned long supported;
+
+	if (vma->vm_flags & VM_DROPPABLE)
+		return false;
+
+	vm_flags &= __VM_UFFD_FLAGS;
+
+#ifndef CONFIG_PTE_MARKER_UFFD_WP
+	/*
+	 * If user requested uffd-wp but not enabled pte markers for
+	 * uffd-wp, then any file system (like shmem or hugetlbfs) are not
+	 * supported but only anonymous.
+	 */
+	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
+		return false;
+#endif
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
+	if (vma_is_anonymous(vma))
+		/* Anonymous has no page cache, MINOR not supported */
+		supported = VM_UFFD_MISSING | VM_UFFD_WP;
+	else if (vma_get_uffd_ops(vma))
+		supported = vma_get_uffd_ops(vma)->uffd_features;
+	else
+		return false;
+
+	return !(vm_flags & (~supported));
+}
+
 static __always_inline
 bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
 {
@@ -384,11 +420,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 {
 	struct inode *inode = file_inode(dst_vma->vm_file);
 	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+	const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
 	struct folio *folio;
 	struct page *page;
 	int ret;
 
-	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
+	if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_get_folio))
+		return -EINVAL;
+
+	ret = uffd_ops->uffd_get_folio(inode, pgoff, &folio);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
 	if (ret == -ENOENT)
 		ret = -EFAULT;
@@ -504,18 +544,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	u32 hash;
 	struct address_space *mapping;
 
-	/*
-	 * There is no default zero huge page for all huge page sizes as
-	 * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
-	 * by THP.  Since we can not reliably insert a zero page, this
-	 * feature is not supported.
-	 */
-	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
-		up_read(&ctx->map_changing_lock);
-		uffd_mfill_unlock(dst_vma);
-		return -EINVAL;
-	}
-
 	src_addr = src_start;
 	dst_addr = dst_start;
 	copied = 0;
@@ -686,14 +714,55 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 			err = mfill_atomic_pte_zeropage(dst_pmd,
 						 dst_vma, dst_addr);
 	} else {
-		err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
-					     dst_addr, src_addr,
-					     flags, foliop);
+		const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
+
+		if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_copy)) {
+			err = -EINVAL;
+		} else {
+			err = uffd_ops->uffd_copy(dst_pmd, dst_vma,
+						  dst_addr, src_addr,
+						  flags, foliop);
+		}
 	}
 
 	return err;
 }
 
+static inline bool
+vma_uffd_ops_supported(struct vm_area_struct *vma, uffd_flags_t flags)
+{
+	enum mfill_atomic_mode mode = uffd_flags_get_mode(flags);
+	const vm_uffd_ops *uffd_ops;
+	unsigned long uffd_ioctls;
+
+	if ((flags & MFILL_ATOMIC_WP) && !(vma->vm_flags & VM_UFFD_WP))
+		return false;
+
+	/* Anonymous supports everything except CONTINUE */
+	if (vma_is_anonymous(vma))
+		return mode != MFILL_ATOMIC_CONTINUE;
+
+	uffd_ops = vma_get_uffd_ops(vma);
+	if (!uffd_ops)
+		return false;
+
+	uffd_ioctls = uffd_ops->uffd_ioctls;
+	switch (mode) {
+	case MFILL_ATOMIC_COPY:
+		return uffd_ioctls & BIT(_UFFDIO_COPY);
+	case MFILL_ATOMIC_ZEROPAGE:
+		return uffd_ioctls & BIT(_UFFDIO_ZEROPAGE);
+	case MFILL_ATOMIC_CONTINUE:
+		if (!(vma->vm_flags & VM_SHARED))
+			return false;
+		return uffd_ioctls & BIT(_UFFDIO_CONTINUE);
+	case MFILL_ATOMIC_POISON:
+		return uffd_ioctls & BIT(_UFFDIO_POISON);
+	default:
+		return false;
+	}
+}
+
 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 					    unsigned long dst_start,
 					    unsigned long src_start,
@@ -752,11 +821,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	    dst_vma->vm_flags & VM_SHARED))
 		goto out_unlock;
 
-	/*
-	 * validate 'mode' now that we know the dst_vma: don't allow
-	 * a wrprotect copy if the userfaultfd didn't register as WP.
-	 */
-	if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
+	if (!vma_uffd_ops_supported(dst_vma, flags))
 		goto out_unlock;
 
 	/*
@@ -766,12 +831,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
 					     src_start, len, flags);
 
-	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
-		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
-	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
-		goto out_unlock;
-
 	while (src_addr < src_start + len) {
 		pmd_t dst_pmdval;
 
-- 
2.49.0

Re: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm

Posted by James Houghton 3 months, 2 weeks ago

On Fri, Jun 20, 2025 at 12:04 PM Peter Xu <peterx@redhat.com> wrote:
>
> This patch completely moves the old userfaultfd core to use the new
> vm_uffd_ops API.  After this change, existing file systems will start to
> use the new API for userfault operations.
>
> When at it, moving vma_can_userfault() into mm/userfaultfd.c instead,
> because it's getting too big.  It's only used in slow paths so it shouldn't
> be an issue.
>
> This will also remove quite some hard-coded checks for either shmem or
> hugetlbfs.  Now all the old checks should still work but with vm_uffd_ops.
>
> Note that anonymous memory will still need to be processed separately
> because it doesn't have vm_ops at all.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/linux/shmem_fs.h      |  14 -----
>  include/linux/userfaultfd_k.h |  46 ++++----------
>  mm/shmem.c                    |   2 +-
>  mm/userfaultfd.c              | 115 +++++++++++++++++++++++++---------
>  4 files changed, 101 insertions(+), 76 deletions(-)
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 6d0f9c599ff7..2f5b7b295cf6 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -195,20 +195,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
>  extern bool shmem_charge(struct inode *inode, long pages);
>  extern void shmem_uncharge(struct inode *inode, long pages);
>
> -#ifdef CONFIG_USERFAULTFD
> -#ifdef CONFIG_SHMEM
> -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
> -                                 struct vm_area_struct *dst_vma,
> -                                 unsigned long dst_addr,
> -                                 unsigned long src_addr,
> -                                 uffd_flags_t flags,
> -                                 struct folio **foliop);
> -#else /* !CONFIG_SHMEM */
> -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
> -                              src_addr, flags, foliop) ({ BUG(); 0; })
> -#endif /* CONFIG_SHMEM */
> -#endif /* CONFIG_USERFAULTFD */
> -
>  /*
>   * Used space is stored as unsigned 64-bit value in bytes but
>   * quota core supports only signed 64-bit values so use that
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index e79c724b3b95..4e56ad423a4a 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -85,9 +85,14 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
>  #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
>  #define MFILL_ATOMIC_MODE_MASK ((__force uffd_flags_t) (MFILL_ATOMIC_BIT(0) - 1))
>
> +static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
> +{
> +       return (enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
> +}
> +
>  static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
>  {
> -       return (flags & MFILL_ATOMIC_MODE_MASK) == ((__force uffd_flags_t) expected);
> +       return uffd_flags_get_mode(flags) == expected;
>  }
>
>  static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
> @@ -196,41 +201,16 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
>         return vma->vm_flags & __VM_UFFD_FLAGS;
>  }
>
> -static inline bool vma_can_userfault(struct vm_area_struct *vma,
> -                                    unsigned long vm_flags,
> -                                    bool wp_async)
> +static inline const vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
>  {
> -       vm_flags &= __VM_UFFD_FLAGS;
> -
> -       if (vma->vm_flags & VM_DROPPABLE)
> -               return false;
> -
> -       if ((vm_flags & VM_UFFD_MINOR) &&
> -           (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
> -               return false;
> -
> -       /*
> -        * If wp async enabled, and WP is the only mode enabled, allow any
> -        * memory type.
> -        */
> -       if (wp_async && (vm_flags == VM_UFFD_WP))
> -               return true;
> -
> -#ifndef CONFIG_PTE_MARKER_UFFD_WP
> -       /*
> -        * If user requested uffd-wp but not enabled pte markers for
> -        * uffd-wp, then shmem & hugetlbfs are not supported but only
> -        * anonymous.
> -        */
> -       if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
> -               return false;
> -#endif

Hi Peter,

Thanks for this cleanup!

It looks like the above two checks, the wp-async one and the PTE
marker check, have been reordered in this patch. Does this result in a
functional difference?

The rest of this series looks fine to me. :)

> -
> -       /* By default, allow any of anon|shmem|hugetlb */
> -       return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
> -           vma_is_shmem(vma);
> +       if (vma->vm_ops && vma->vm_ops->userfaultfd_ops)
> +               return vma->vm_ops->userfaultfd_ops;
> +       return NULL;
>  }
>
> +bool vma_can_userfault(struct vm_area_struct *vma,
> +                      unsigned long vm_flags, bool wp_async);
> +
>  static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
>  {
>         struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
> diff --git a/mm/shmem.c b/mm/shmem.c
> index bd0a29000318..4d71fc7be358 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -3158,7 +3158,7 @@ static int shmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
>         return shmem_get_folio(inode, pgoff, 0, folio, SGP_NOALLOC);
>  }
>
> -int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
> +static int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
>                            struct vm_area_struct *dst_vma,
>                            unsigned long dst_addr,
>                            unsigned long src_addr,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 879505c6996f..61783ff2d335 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -14,12 +14,48 @@
>  #include <linux/userfaultfd_k.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/hugetlb.h>
> -#include <linux/shmem_fs.h>
>  #include <asm/tlbflush.h>
>  #include <asm/tlb.h>
>  #include "internal.h"
>  #include "swap.h"
>
> +bool vma_can_userfault(struct vm_area_struct *vma,
> +                      unsigned long vm_flags, bool wp_async)
> +{
> +       unsigned long supported;
> +
> +       if (vma->vm_flags & VM_DROPPABLE)
> +               return false;
> +
> +       vm_flags &= __VM_UFFD_FLAGS;
> +
> +#ifndef CONFIG_PTE_MARKER_UFFD_WP
> +       /*
> +        * If user requested uffd-wp but not enabled pte markers for
> +        * uffd-wp, then any file system (like shmem or hugetlbfs) are not
> +        * supported but only anonymous.
> +        */
> +       if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
> +               return false;
> +#endif
> +       /*
> +        * If wp async enabled, and WP is the only mode enabled, allow any
> +        * memory type.
> +        */
> +       if (wp_async && (vm_flags == VM_UFFD_WP))
> +               return true;
> +
> +       if (vma_is_anonymous(vma))
> +               /* Anonymous has no page cache, MINOR not supported */
> +               supported = VM_UFFD_MISSING | VM_UFFD_WP;
> +       else if (vma_get_uffd_ops(vma))
> +               supported = vma_get_uffd_ops(vma)->uffd_features;
> +       else
> +               return false;
> +
> +       return !(vm_flags & (~supported));
> +}
> +
>  static __always_inline
>  bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
>  {
> @@ -384,11 +420,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
>  {
>         struct inode *inode = file_inode(dst_vma->vm_file);
>         pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
> +       const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
>         struct folio *folio;
>         struct page *page;
>         int ret;
>
> -       ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
> +       if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_get_folio))
> +               return -EINVAL;
> +
> +       ret = uffd_ops->uffd_get_folio(inode, pgoff, &folio);
>         /* Our caller expects us to return -EFAULT if we failed to find folio */
>         if (ret == -ENOENT)
>                 ret = -EFAULT;
> @@ -504,18 +544,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
>         u32 hash;
>         struct address_space *mapping;
>
> -       /*
> -        * There is no default zero huge page for all huge page sizes as
> -        * supported by hugetlb.  A PMD_SIZE huge pages may exist as used
> -        * by THP.  Since we can not reliably insert a zero page, this
> -        * feature is not supported.
> -        */
> -       if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
> -               up_read(&ctx->map_changing_lock);
> -               uffd_mfill_unlock(dst_vma);
> -               return -EINVAL;
> -       }
> -
>         src_addr = src_start;
>         dst_addr = dst_start;
>         copied = 0;
> @@ -686,14 +714,55 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
>                         err = mfill_atomic_pte_zeropage(dst_pmd,
>                                                  dst_vma, dst_addr);
>         } else {
> -               err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
> -                                            dst_addr, src_addr,
> -                                            flags, foliop);
> +               const vm_uffd_ops *uffd_ops = vma_get_uffd_ops(dst_vma);
> +
> +               if (WARN_ON_ONCE(!uffd_ops || !uffd_ops->uffd_copy)) {
> +                       err = -EINVAL;
> +               } else {
> +                       err = uffd_ops->uffd_copy(dst_pmd, dst_vma,
> +                                                 dst_addr, src_addr,
> +                                                 flags, foliop);
> +               }
>         }
>
>         return err;
>  }
>
> +static inline bool
> +vma_uffd_ops_supported(struct vm_area_struct *vma, uffd_flags_t flags)
> +{
> +       enum mfill_atomic_mode mode = uffd_flags_get_mode(flags);
> +       const vm_uffd_ops *uffd_ops;
> +       unsigned long uffd_ioctls;
> +
> +       if ((flags & MFILL_ATOMIC_WP) && !(vma->vm_flags & VM_UFFD_WP))
> +               return false;
> +
> +       /* Anonymous supports everything except CONTINUE */
> +       if (vma_is_anonymous(vma))
> +               return mode != MFILL_ATOMIC_CONTINUE;
> +
> +       uffd_ops = vma_get_uffd_ops(vma);
> +       if (!uffd_ops)
> +               return false;
> +
> +       uffd_ioctls = uffd_ops->uffd_ioctls;
> +       switch (mode) {
> +       case MFILL_ATOMIC_COPY:
> +               return uffd_ioctls & BIT(_UFFDIO_COPY);
> +       case MFILL_ATOMIC_ZEROPAGE:
> +               return uffd_ioctls & BIT(_UFFDIO_ZEROPAGE);
> +       case MFILL_ATOMIC_CONTINUE:
> +               if (!(vma->vm_flags & VM_SHARED))
> +                       return false;
> +               return uffd_ioctls & BIT(_UFFDIO_CONTINUE);
> +       case MFILL_ATOMIC_POISON:
> +               return uffd_ioctls & BIT(_UFFDIO_POISON);
> +       default:
> +               return false;
> +       }
> +}
> +
>  static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>                                             unsigned long dst_start,
>                                             unsigned long src_start,
> @@ -752,11 +821,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>             dst_vma->vm_flags & VM_SHARED))
>                 goto out_unlock;
>
> -       /*
> -        * validate 'mode' now that we know the dst_vma: don't allow
> -        * a wrprotect copy if the userfaultfd didn't register as WP.
> -        */
> -       if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
> +       if (!vma_uffd_ops_supported(dst_vma, flags))
>                 goto out_unlock;
>
>         /*
> @@ -766,12 +831,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>                 return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
>                                              src_start, len, flags);
>
> -       if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> -               goto out_unlock;
> -       if (!vma_is_shmem(dst_vma) &&
> -           uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> -               goto out_unlock;
> -
>         while (src_addr < src_start + len) {
>                 pmd_t dst_pmdval;
>
> --
> 2.49.0
>

Re: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm

Posted by Peter Xu 3 months, 2 weeks ago

On Wed, Jun 25, 2025 at 01:31:49PM -0700, James Houghton wrote:
> > -static inline bool vma_can_userfault(struct vm_area_struct *vma,
> > -                                    unsigned long vm_flags,
> > -                                    bool wp_async)
> > +static inline const vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
> >  {
> > -       vm_flags &= __VM_UFFD_FLAGS;
> > -
> > -       if (vma->vm_flags & VM_DROPPABLE)
> > -               return false;
> > -
> > -       if ((vm_flags & VM_UFFD_MINOR) &&
> > -           (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
> > -               return false;
> > -
> > -       /*
> > -        * If wp async enabled, and WP is the only mode enabled, allow any
> > -        * memory type.
> > -        */
> > -       if (wp_async && (vm_flags == VM_UFFD_WP))
> > -               return true;
> > -
> > -#ifndef CONFIG_PTE_MARKER_UFFD_WP
> > -       /*
> > -        * If user requested uffd-wp but not enabled pte markers for
> > -        * uffd-wp, then shmem & hugetlbfs are not supported but only
> > -        * anonymous.
> > -        */
> > -       if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
> > -               return false;
> > -#endif
> 
> Hi Peter,
> 
> Thanks for this cleanup!
> 
> It looks like the above two checks, the wp-async one and the PTE
> marker check, have been reordered in this patch. Does this result in a
> functional difference?
> 
> The rest of this series looks fine to me. :)

Thanks for the very careful review, James!

Yes that's a small tweak I did when moving. I don't expect to have any
functional change.  Maybe I should at least mention that in the commit log.

Here I did the movement because fundamentally wp_async depends on the pte
markers, so it may be slightly more intuitive to check pte markers first,
rejecting any form of file wr-protect traps.  Otherwise it may looks like
we could return the true for wp_async==true too early.  In reality IIUC it
can't happen.

For example, currently userfaultfd_api() has:

#ifndef CONFIG_PTE_MARKER_UFFD_WP
	uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
	uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
	uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif

So when wp_async can be true above, pte markers must be compiled..  IOW,
above code clip should work identically with below lines:

    #ifdef CONFIG_PTE_MARKER_UFFD_WP
       if (wp_async && (vm_flags == VM_UFFD_WP))
            return true;
    #endif

    #ifndef CONFIG_PTE_MARKER_UFFD_WP
        if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
            return false;
    #endif

Then it means both chunks of code cannot be compiled together.  The order
shouldn't matter.

But maybe I should just move it back as before, to save the explain and
confusions.  Let me know if you have any preference.

Thanks,

-- 
Peter Xu

Re: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm

Posted by James Houghton 3 months, 2 weeks ago

On Wed, Jun 25, 2025 at 2:21 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Jun 25, 2025 at 01:31:49PM -0700, James Houghton wrote:
> > > -static inline bool vma_can_userfault(struct vm_area_struct *vma,
> > > -                                    unsigned long vm_flags,
> > > -                                    bool wp_async)
> > > +static inline const vm_uffd_ops *vma_get_uffd_ops(struct vm_area_struct *vma)
> > >  {
> > > -       vm_flags &= __VM_UFFD_FLAGS;
> > > -
> > > -       if (vma->vm_flags & VM_DROPPABLE)
> > > -               return false;
> > > -
> > > -       if ((vm_flags & VM_UFFD_MINOR) &&
> > > -           (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
> > > -               return false;
> > > -
> > > -       /*
> > > -        * If wp async enabled, and WP is the only mode enabled, allow any
> > > -        * memory type.
> > > -        */
> > > -       if (wp_async && (vm_flags == VM_UFFD_WP))
> > > -               return true;
> > > -
> > > -#ifndef CONFIG_PTE_MARKER_UFFD_WP
> > > -       /*
> > > -        * If user requested uffd-wp but not enabled pte markers for
> > > -        * uffd-wp, then shmem & hugetlbfs are not supported but only
> > > -        * anonymous.
> > > -        */
> > > -       if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
> > > -               return false;
> > > -#endif
> >
> > Hi Peter,
> >
> > Thanks for this cleanup!
> >
> > It looks like the above two checks, the wp-async one and the PTE
> > marker check, have been reordered in this patch. Does this result in a
> > functional difference?
> >
> > The rest of this series looks fine to me. :)
>
> Thanks for the very careful review, James!
>
> Yes that's a small tweak I did when moving. I don't expect to have any
> functional change.  Maybe I should at least mention that in the commit log.

Yeah if you could leave a small explanation, like what you have below,
in the commit log, that would be good. :)

>
> Here I did the movement because fundamentally wp_async depends on the pte
> markers, so it may be slightly more intuitive to check pte markers first,
> rejecting any form of file wr-protect traps.  Otherwise it may looks like
> we could return the true for wp_async==true too early.  In reality IIUC it
> can't happen.
>
> For example, currently userfaultfd_api() has:
>
> #ifndef CONFIG_PTE_MARKER_UFFD_WP
>         uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
>         uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
>         uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
> #endif

Ah, I see, thanks!

> So when wp_async can be true above, pte markers must be compiled..  IOW,
> above code clip should work identically with below lines:
>
>     #ifdef CONFIG_PTE_MARKER_UFFD_WP
>        if (wp_async && (vm_flags == VM_UFFD_WP))
>             return true;
>     #endif
>
>     #ifndef CONFIG_PTE_MARKER_UFFD_WP
>         if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
>             return false;
>     #endif
>
> Then it means both chunks of code cannot be compiled together.  The order
> shouldn't matter.
>
> But maybe I should just move it back as before, to save the explain and
> confusions.  Let me know if you have any preference.

Feel free to leave this patch as it is, that's fine. Thanks for the
explanation. :)

If you'd like, feel free to add:

Reviewed-by: James Houghton <jthoughton@google.com>

Re: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm

Posted by kernel test robot 3 months, 2 weeks ago

Hi Peter,

kernel test robot noticed the following build warnings:

[auto build test WARNING on next-20250620]
[cannot apply to akpm-mm/mm-everything linus/master v6.16-rc2 v6.16-rc1 v6.15 v6.16-rc2]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-Introduce-vm_uffd_ops-API/20250621-030557
base:   next-20250620
patch link:    https://lore.kernel.org/r/20250620190342.1780170-5-peterx%40redhat.com
patch subject: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm
config: i386-randconfig-061-20250622 (https://download.01.org/0day-ci/archive/20250623/202506230216.JVgQj2Si-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250623/202506230216.JVgQj2Si-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202506230216.JVgQj2Si-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
   mm/shmem.c: note: in included file (through include/linux/shmem_fs.h):
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
--
   mm/hugetlb.c:669:12: sparse: sparse: context imbalance in 'allocate_file_region_entries' - wrong count at exit
   mm/hugetlb.c:740:13: sparse: sparse: context imbalance in 'region_add' - wrong count at exit
   mm/hugetlb.c:807:13: sparse: sparse: context imbalance in 'region_chg' - wrong count at exit
   mm/hugetlb.c:5798:20: sparse: sparse: context imbalance in 'move_huge_pte' - different lock contexts for basic block
   mm/hugetlb.c: note: in included file:
   include/linux/mm.h:1391:22: sparse: sparse: context imbalance in 'hugetlb_wp' - unexpected unlock
   mm/hugetlb.c: note: in included file (through include/linux/hugetlb.h, include/linux/migrate.h):
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
--
   mm/userfaultfd.c:270:9: sparse: sparse: context imbalance in 'mfill_atomic_install_pte' - different lock contexts for basic block
   mm/userfaultfd.c:412:9: sparse: sparse: context imbalance in 'mfill_atomic_pte_zeropage' - different lock contexts for basic block
   mm/userfaultfd.c:498:9: sparse: sparse: context imbalance in 'mfill_atomic_pte_poison' - different lock contexts for basic block
   mm/userfaultfd.c: note: in included file:
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
   mm/userfaultfd.c: note: in included file (through include/linux/rbtree.h, include/linux/mm_types.h, include/linux/mmzone.h, ...):
   include/linux/rcupdate.h:871:25: sparse: sparse: context imbalance in 'move_pages_pte' - unexpected unlock

vim +90 include/linux/userfaultfd_k.h

    87	
    88	static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
    89	{
  > 90		return (enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
    91	}
    92	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm

Posted by Peter Xu 3 months, 2 weeks ago

On Mon, Jun 23, 2025 at 03:09:13AM +0800, kernel test robot wrote:
> Hi Peter,
> 
> kernel test robot noticed the following build warnings:
> 
> [auto build test WARNING on next-20250620]
> [cannot apply to akpm-mm/mm-everything linus/master v6.16-rc2 v6.16-rc1 v6.15 v6.16-rc2]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-Introduce-vm_uffd_ops-API/20250621-030557
> base:   next-20250620
> patch link:    https://lore.kernel.org/r/20250620190342.1780170-5-peterx%40redhat.com
> patch subject: [PATCH 4/4] mm: Apply vm_uffd_ops API to core mm
> config: i386-randconfig-061-20250622 (https://download.01.org/0day-ci/archive/20250623/202506230216.JVgQj2Si-lkp@intel.com/config)
> compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250623/202506230216.JVgQj2Si-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202506230216.JVgQj2Si-lkp@intel.com/
> 
> sparse warnings: (new ones prefixed by >>)
>    mm/shmem.c: note: in included file (through include/linux/shmem_fs.h):
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> --
>    mm/hugetlb.c:669:12: sparse: sparse: context imbalance in 'allocate_file_region_entries' - wrong count at exit
>    mm/hugetlb.c:740:13: sparse: sparse: context imbalance in 'region_add' - wrong count at exit
>    mm/hugetlb.c:807:13: sparse: sparse: context imbalance in 'region_chg' - wrong count at exit
>    mm/hugetlb.c:5798:20: sparse: sparse: context imbalance in 'move_huge_pte' - different lock contexts for basic block
>    mm/hugetlb.c: note: in included file:
>    include/linux/mm.h:1391:22: sparse: sparse: context imbalance in 'hugetlb_wp' - unexpected unlock
>    mm/hugetlb.c: note: in included file (through include/linux/hugetlb.h, include/linux/migrate.h):
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> --
>    mm/userfaultfd.c:270:9: sparse: sparse: context imbalance in 'mfill_atomic_install_pte' - different lock contexts for basic block
>    mm/userfaultfd.c:412:9: sparse: sparse: context imbalance in 'mfill_atomic_pte_zeropage' - different lock contexts for basic block
>    mm/userfaultfd.c:498:9: sparse: sparse: context imbalance in 'mfill_atomic_pte_poison' - different lock contexts for basic block
>    mm/userfaultfd.c: note: in included file:
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
> >> include/linux/userfaultfd_k.h:90:17: sparse: sparse: cast from restricted uffd_flags_t
>    mm/userfaultfd.c: note: in included file (through include/linux/rbtree.h, include/linux/mm_types.h, include/linux/mmzone.h, ...):
>    include/linux/rcupdate.h:871:25: sparse: sparse: context imbalance in 'move_pages_pte' - unexpected unlock
> 
> vim +90 include/linux/userfaultfd_k.h
> 
>     87	
>     88	static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
>     89	{
>   > 90		return (enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
>     91	}
>     92	

Sigh.. I thought the cast would helped to tell sparse on this.  Looks like
I need __force..

I'll squash below change when repost:

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 6064f9542d5b..6c5ca68204dd 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -87,7 +87,7 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

 static inline enum mfill_atomic_mode uffd_flags_get_mode(uffd_flags_t flags)
 {
-       return (enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
+       return (__force enum mfill_atomic_mode)(flags & MFILL_ATOMIC_MODE_MASK);
 }

 static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)

-- 
Peter Xu

[PATCH 1/4] mm: Introduce vm_uffd_ops API
[PATCH 2/4] mm/shmem: Support vm_uffd_ops API
[PATCH 3/4] mm/hugetlb: Support vm_uffd_ops API
[PATCH 4/4] mm: Apply vm_uffd_ops API to core mm