[PATCH v2 03/15] userfaultfd: introduce mfill_get_pmd() helper.

Mike Rapoport posted 15 patches 1 month ago
There is a newer version of this series
[PATCH v2 03/15] userfaultfd: introduce mfill_get_pmd() helper.
Posted by Mike Rapoport 1 month ago
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

There is a lengthy code chunk in mfill_atomic() that establishes the PMD
for UFFDIO operations. This code may be called twice: first time when
the copy is performed with VMA/mm locks held and the other time after
the copy is retried with locks dropped.

Move the code that establishes a PMD into a helper function so it can be
reused later during refactoring of mfill_atomic_pte_copy().

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/userfaultfd.c | 103 ++++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 50 deletions(-)

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e68d01743b03..224b55804f99 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -157,6 +157,57 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma)
 }
 #endif
 
+static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	pgd = pgd_offset(mm, address);
+	p4d = p4d_alloc(mm, pgd, address);
+	if (!p4d)
+		return NULL;
+	pud = pud_alloc(mm, p4d, address);
+	if (!pud)
+		return NULL;
+	/*
+	 * Note that we didn't run this because the pmd was
+	 * missing, the *pmd may be already established and in
+	 * turn it may also be a trans_huge_pmd.
+	 */
+	return pmd_alloc(mm, pud, address);
+}
+
+static int mfill_get_pmd(struct mfill_state *state)
+{
+	struct mm_struct *dst_mm = state->ctx->mm;
+	pmd_t *dst_pmd;
+	pmd_t dst_pmdval;
+
+	dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr);
+	if (unlikely(!dst_pmd))
+		return -ENOMEM;
+
+	dst_pmdval = pmdp_get_lockless(dst_pmd);
+	if (unlikely(pmd_none(dst_pmdval)) &&
+	    unlikely(__pte_alloc(dst_mm, dst_pmd)))
+		return -ENOMEM;
+
+	dst_pmdval = pmdp_get_lockless(dst_pmd);
+	/*
+	 * If the dst_pmd is THP don't override it and just be strict.
+	 * (This includes the case where the PMD used to be THP and
+	 * changed back to none after __pte_alloc().)
+	 */
+	if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval)))
+		return -EEXIST;
+	if (unlikely(pmd_bad(dst_pmdval)))
+		return -EFAULT;
+
+	state->pmd = dst_pmd;
+	return 0;
+}
+
 /* Check if dst_addr is outside of file's size. Must be called with ptl held. */
 static bool mfill_file_over_size(struct vm_area_struct *dst_vma,
 				 unsigned long dst_addr)
@@ -489,27 +540,6 @@ static int mfill_atomic_pte_poison(struct mfill_state *state)
 	return ret;
 }
 
-static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
-{
-	pgd_t *pgd;
-	p4d_t *p4d;
-	pud_t *pud;
-
-	pgd = pgd_offset(mm, address);
-	p4d = p4d_alloc(mm, pgd, address);
-	if (!p4d)
-		return NULL;
-	pud = pud_alloc(mm, p4d, address);
-	if (!pud)
-		return NULL;
-	/*
-	 * Note that we didn't run this because the pmd was
-	 * missing, the *pmd may be already established and in
-	 * turn it may also be a trans_huge_pmd.
-	 */
-	return pmd_alloc(mm, pud, address);
-}
-
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * mfill_atomic processing for HUGETLB vmas.  Note that this routine is
@@ -742,7 +772,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	struct vm_area_struct *dst_vma;
 	long copied = 0;
 	ssize_t err;
-	pmd_t *dst_pmd;
 
 	/*
 	 * Sanitize the command parameters:
@@ -809,41 +838,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 	while (state.src_addr < src_start + len) {
 		VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
 
-		pmd_t dst_pmdval;
-
-		dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
-		if (unlikely(!dst_pmd)) {
-			err = -ENOMEM;
+		err = mfill_get_pmd(&state);
+		if (err)
 			break;
-		}
 
-		dst_pmdval = pmdp_get_lockless(dst_pmd);
-		if (unlikely(pmd_none(dst_pmdval)) &&
-		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
-			err = -ENOMEM;
-			break;
-		}
-		dst_pmdval = pmdp_get_lockless(dst_pmd);
-		/*
-		 * If the dst_pmd is THP don't override it and just be strict.
-		 * (This includes the case where the PMD used to be THP and
-		 * changed back to none after __pte_alloc().)
-		 */
-		if (unlikely(!pmd_present(dst_pmdval) ||
-				pmd_trans_huge(dst_pmdval))) {
-			err = -EEXIST;
-			break;
-		}
-		if (unlikely(pmd_bad(dst_pmdval))) {
-			err = -EFAULT;
-			break;
-		}
 		/*
 		 * For shmem mappings, khugepaged is allowed to remove page
 		 * tables under us; pte_offset_map_lock() will deal with that.
 		 */
 
-		state.pmd = dst_pmd;
 		err = mfill_atomic_pte(&state);
 		cond_resched();
 
-- 
2.51.0
Re: [PATCH v2 03/15] userfaultfd: introduce mfill_get_pmd() helper.
Posted by David Hildenbrand (Arm) 2 weeks, 6 days ago
On 3/6/26 18:18, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Nit: "." at the end of the patch subject

> 
> There is a lengthy code chunk in mfill_atomic() that establishes the PMD
> for UFFDIO operations. This code may be called twice: first time when
> the copy is performed with VMA/mm locks held and the other time after
> the copy is retried with locks dropped.
> 
> Move the code that establishes a PMD into a helper function so it can be
> reused later during refactoring of mfill_atomic_pte_copy().
> 
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  mm/userfaultfd.c | 103 ++++++++++++++++++++++++-----------------------
>  1 file changed, 53 insertions(+), 50 deletions(-)
> 
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index e68d01743b03..224b55804f99 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -157,6 +157,57 @@ static void uffd_mfill_unlock(struct vm_area_struct *vma)
>  }
>  #endif
>  
> +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
> +{
> +	pgd_t *pgd;
> +	p4d_t *p4d;
> +	pud_t *pud;
> +
> +	pgd = pgd_offset(mm, address);
> +	p4d = p4d_alloc(mm, pgd, address);
> +	if (!p4d)
> +		return NULL;
> +	pud = pud_alloc(mm, p4d, address);
> +	if (!pud)
> +		return NULL;
> +	/*
> +	 * Note that we didn't run this because the pmd was
> +	 * missing, the *pmd may be already established and in
> +	 * turn it may also be a trans_huge_pmd.
> +	 */
> +	return pmd_alloc(mm, pud, address);
> +}
> +
> +static int mfill_get_pmd(struct mfill_state *state)
> +{
> +	struct mm_struct *dst_mm = state->ctx->mm;
> +	pmd_t *dst_pmd;
> +	pmd_t dst_pmdval;

I'd just have both on a single line.

> +
> +	dst_pmd = mm_alloc_pmd(dst_mm, state->dst_addr);
> +	if (unlikely(!dst_pmd))
> +		return -ENOMEM;
> +
> +	dst_pmdval = pmdp_get_lockless(dst_pmd);
> +	if (unlikely(pmd_none(dst_pmdval)) &&
> +	    unlikely(__pte_alloc(dst_mm, dst_pmd)))
> +		return -ENOMEM;
> +
> +	dst_pmdval = pmdp_get_lockless(dst_pmd);
> +	/*
> +	 * If the dst_pmd is THP don't override it and just be strict.
> +	 * (This includes the case where the PMD used to be THP and
> +	 * changed back to none after __pte_alloc().)
> +	 */
> +	if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval)))

Can we directly switch to pmd_leaf() while touching that?

> +		return -EEXIST;
> +	if (unlikely(pmd_bad(dst_pmdval)))
> +		return -EFAULT;
> +
> +	state->pmd = dst_pmd;
> +	return 0;
> +}


[...]

>  	/*
>  	 * Sanitize the command parameters:
> @@ -809,41 +838,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>  	while (state.src_addr < src_start + len) {
>  		VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
>  
> -		pmd_t dst_pmdval;
> -
> -		dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
> -		if (unlikely(!dst_pmd)) {
> -			err = -ENOMEM;
> +		err = mfill_get_pmd(&state);
> +		if (err)


It's a bit odd that a "get" function doesn't return a PMD pointer but
instead stores it in the state.

Maybe more like "mfill_prepare_pmd" ? But actually you want to have a
pte table.

mfill_prepare_pte_table() or alternatively mfill_alloc_pte_table() /
mfill_alloc_dst_pte_table()

-- 
Cheers,

David
Re: [PATCH v2 03/15] userfaultfd: introduce mfill_get_pmd() helper.
Posted by Mike Rapoport 2 weeks, 4 days ago
On Fri, Mar 20, 2026 at 01:55:29PM +0100, David Hildenbrand (Arm) wrote:
> On 3/6/26 18:18, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> 
> Nit: "." at the end of the patch subject
 
Oops :/
 
> > +static int mfill_get_pmd(struct mfill_state *state)
> > +{
> > +	struct mm_struct *dst_mm = state->ctx->mm;
> > +	pmd_t *dst_pmd;
> > +	pmd_t dst_pmdval;
> 
> I'd just have both on a single line.

Can do :) 

> > +	/*
> > +	 * If the dst_pmd is THP don't override it and just be strict.
> > +	 * (This includes the case where the PMD used to be THP and
> > +	 * changed back to none after __pte_alloc().)
> > +	 */
> > +	if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval)))
> 
> Can we directly switch to pmd_leaf() while touching that?

You mean instead of pmd_trans_huge()?
Yeah, sure.

> > @@ -809,41 +838,15 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >  	while (state.src_addr < src_start + len) {
> >  		VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> >  
> > -		pmd_t dst_pmdval;
> > -
> > -		dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
> > -		if (unlikely(!dst_pmd)) {
> > -			err = -ENOMEM;
> > +		err = mfill_get_pmd(&state);
> > +		if (err)
> 
> 
> It's a bit odd that a "get" function doesn't return a PMD pointer but
> instead stores it in the state.
> 
> Maybe more like "mfill_prepare_pmd" ? But actually you want to have a
> pte table.
> 
> mfill_prepare_pte_table() or alternatively mfill_alloc_pte_table() /
> mfill_alloc_dst_pte_table()

As it actually allocates the pte table once in 512 times, I'd prefer
mfill_establish_pmd().
 
> -- 
> Cheers,
> David

-- 
Sincerely yours,
Mike.
Re: [PATCH v2 03/15] userfaultfd: introduce mfill_get_pmd() helper.
Posted by David Hildenbrand (Arm) 1 week, 3 days ago
>>
>> mfill_prepare_pte_table() or alternatively mfill_alloc_pte_table() /
>> mfill_alloc_dst_pte_table()
> 
> As it actually allocates the pte table once in 512 times, I'd prefer
> mfill_establish_pmd().

Replying only now (spotted v3 in my inbox)m sounds good to me.

-- 
Cheers,

David