From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
mfill_atomic() passes a lot of parameters down to its callees.
Aggregate them all into mfill_state structure and pass this structure to
functions that implement various UFFDIO_ commands.
Tracking the state in a structure will allow moving the code that retries
copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
memory.
The mfill_state definition is deliberately local to mm/userfaultfd.c,
hence shmem_mfill_atomic_pte() is not updated.
[harry.yoo@oracle.com: properly initialize mfill_state.len to fix
folio_add_new_anon_rmap() WARN]
Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
---
mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
1 file changed, 82 insertions(+), 66 deletions(-)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 32637d557c95..fa9622ec7279 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,6 +20,20 @@
#include "internal.h"
#include "swap.h"
+struct mfill_state {
+ struct userfaultfd_ctx *ctx;
+ unsigned long src_start;
+ unsigned long dst_start;
+ unsigned long len;
+ uffd_flags_t flags;
+
+ struct vm_area_struct *vma;
+ unsigned long src_addr;
+ unsigned long dst_addr;
+ struct folio *folio;
+ pmd_t *pmd;
+};
+
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
@@ -272,17 +286,17 @@ static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
return ret;
}
-static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static int mfill_atomic_pte_copy(struct mfill_state *state)
{
- int ret;
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
+ unsigned long src_addr = state->src_addr;
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
struct folio *folio;
+ int ret;
- if (!*foliop) {
+ if (!state->folio) {
ret = -ENOMEM;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
dst_addr);
@@ -294,13 +308,13 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- *foliop = folio;
+ state->folio = folio;
/* don't free the page */
goto out;
}
} else {
- folio = *foliop;
- *foliop = NULL;
+ folio = state->folio;
+ state->folio = NULL;
}
/*
@@ -357,10 +371,11 @@ static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
return ret;
}
-static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(struct mfill_state *state)
{
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
@@ -392,13 +407,14 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
}
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_continue(struct mfill_state *state)
{
- struct inode *inode = file_inode(dst_vma->vm_file);
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
struct folio *folio;
struct page *page;
int ret;
@@ -436,15 +452,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
}
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
-static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_poison(struct mfill_state *state)
{
- int ret;
+ struct vm_area_struct *dst_vma = state->vma;
struct mm_struct *dst_mm = dst_vma->vm_mm;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
+ int ret;
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
ret = -EAGAIN;
@@ -668,22 +684,20 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state)
{
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long src_addr = state->src_addr;
+ unsigned long dst_addr = state->dst_addr;
+ struct folio **foliop = &state->folio;
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
ssize_t err;
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
- return mfill_atomic_pte_continue(dst_pmd, dst_vma,
- dst_addr, flags);
- } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
- return mfill_atomic_pte_poison(dst_pmd, dst_vma,
- dst_addr, flags);
- }
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+ return mfill_atomic_pte_continue(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON))
+ return mfill_atomic_pte_poison(state);
/*
* The normal page fault path for a shmem will invoke the
@@ -697,12 +711,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
- err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
- dst_addr, src_addr,
- flags, foliop);
+ err = mfill_atomic_pte_copy(state);
else
- err = mfill_atomic_pte_zeropage(dst_pmd,
- dst_vma, dst_addr);
+ err = mfill_atomic_pte_zeropage(state);
} else {
err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
dst_addr, src_addr,
@@ -718,13 +729,20 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long len,
uffd_flags_t flags)
{
+ struct mfill_state state = (struct mfill_state){
+ .ctx = ctx,
+ .dst_start = dst_start,
+ .src_start = src_start,
+ .flags = flags,
+ .len = len,
+ .src_addr = src_start,
+ .dst_addr = dst_start,
+ };
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
+ long copied = 0;
ssize_t err;
pmd_t *dst_pmd;
- unsigned long src_addr, dst_addr;
- long copied;
- struct folio *folio;
/*
* Sanitize the command parameters:
@@ -736,10 +754,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
VM_WARN_ON_ONCE(src_start + len <= src_start);
VM_WARN_ON_ONCE(dst_start + len <= dst_start);
- src_addr = src_start;
- dst_addr = dst_start;
- copied = 0;
- folio = NULL;
retry:
/*
* Make sure the vma is not shared, that the dst range is
@@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- while (src_addr < src_start + len) {
- pmd_t dst_pmdval;
+ state.vma = dst_vma;
- VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
+ while (state.src_addr < src_start + len) {
+ VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
+
+ pmd_t dst_pmdval;
- dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+ dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
if (unlikely(!dst_pmd)) {
err = -ENOMEM;
break;
@@ -827,34 +843,34 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
* tables under us; pte_offset_map_lock() will deal with that.
*/
- err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
- src_addr, flags, &folio);
+ state.pmd = dst_pmd;
+ err = mfill_atomic_pte(&state);
cond_resched();
if (unlikely(err == -ENOENT)) {
void *kaddr;
up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
- VM_WARN_ON_ONCE(!folio);
+ uffd_mfill_unlock(state.vma);
+ VM_WARN_ON_ONCE(!state.folio);
- kaddr = kmap_local_folio(folio, 0);
+ kaddr = kmap_local_folio(state.folio, 0);
err = copy_from_user(kaddr,
- (const void __user *) src_addr,
+ (const void __user *)state.src_addr,
PAGE_SIZE);
kunmap_local(kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
- flush_dcache_folio(folio);
+ flush_dcache_folio(state.folio);
goto retry;
} else
- VM_WARN_ON_ONCE(folio);
+ VM_WARN_ON_ONCE(state.folio);
if (!err) {
- dst_addr += PAGE_SIZE;
- src_addr += PAGE_SIZE;
+ state.dst_addr += PAGE_SIZE;
+ state.src_addr += PAGE_SIZE;
copied += PAGE_SIZE;
if (fatal_signal_pending(current))
@@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
out_unlock:
up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
+ uffd_mfill_unlock(state.vma);
out:
- if (folio)
- folio_put(folio);
+ if (state.folio)
+ folio_put(state.folio);
VM_WARN_ON_ONCE(copied < 0);
VM_WARN_ON_ONCE(err > 0);
VM_WARN_ON_ONCE(!copied && !err);
--
2.53.0
On Mon, Mar 30, 2026 at 01:11:03PM +0300, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> mfill_atomic() passes a lot of parameters down to its callees.
>
> Aggregate them all into mfill_state structure and pass this structure to
> functions that implement various UFFDIO_ commands.
>
> Tracking the state in a structure will allow moving the code that retries
> copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
> loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
> memory.
>
> The mfill_state definition is deliberately local to mm/userfaultfd.c,
> hence shmem_mfill_atomic_pte() is not updated.
>
> [harry.yoo@oracle.com: properly initialize mfill_state.len to fix
> folio_add_new_anon_rmap() WARN]
> Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
> 1 file changed, 82 insertions(+), 66 deletions(-)
>
> @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> goto out_unlock;
>
> - while (src_addr < src_start + len) {
> - pmd_t dst_pmdval;
> + state.vma = dst_vma;
Oh wait, the lock leak was introduced in patch 2.
If there's an error between uffd_mfill_lock() and `state.vma = dst_vma`,
it remains unlocked.
Probably should have been fixed in 2, not patch 4...
Sorry didn't realize it earlier.
> - VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
> + while (state.src_addr < src_start + len) {
> + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> +
> + pmd_t dst_pmdval;
>
> - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> + dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
> if (unlikely(!dst_pmd)) {
> err = -ENOMEM;
> break;
> @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>
> out_unlock:
> up_read(&ctx->map_changing_lock);
> - uffd_mfill_unlock(dst_vma);
> + uffd_mfill_unlock(state.vma);
> out:
> - if (folio)
> - folio_put(folio);
> + if (state.folio)
> + folio_put(state.folio);
Sashiko raised a concern [2] that the VMA might be unmapped and
a new mapping created as a uffd hugetlb vma, leaking the folio by
going through
`if (is_vm_hugetlb_page(dst_vma))
return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
src_start, len, flags);`
but it appears to be a false positive (to me) because
`if (atomic_read(&ctx->mmap_changing))` check should have detected the unmapping
and freed the folio?
[2] https://sashiko.dev/#/patchset/20260330101116.1117699-1-rppt%40kernel.org?patch=13671
> VM_WARN_ON_ONCE(copied < 0);
> VM_WARN_ON_ONCE(err > 0);
> VM_WARN_ON_ONCE(!copied && !err);
Otherwise looks correct to me.
--
Cheers,
Harry / Hyeonggon
Hi Harry,
On Tue, Mar 31, 2026 at 04:03:13PM +0900, Harry Yoo (Oracle) wrote:
> On Mon, Mar 30, 2026 at 01:11:03PM +0300, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> >
> > mfill_atomic() passes a lot of parameters down to its callees.
> >
> > Aggregate them all into mfill_state structure and pass this structure to
> > functions that implement various UFFDIO_ commands.
> >
> > Tracking the state in a structure will allow moving the code that retries
> > copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
> > loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
> > memory.
> >
> > The mfill_state definition is deliberately local to mm/userfaultfd.c,
> > hence shmem_mfill_atomic_pte() is not updated.
> >
> > [harry.yoo@oracle.com: properly initialize mfill_state.len to fix
> > folio_add_new_anon_rmap() WARN]
> > Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> > Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> > ---
> > mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
> > 1 file changed, 82 insertions(+), 66 deletions(-)
> >
> > @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> > goto out_unlock;
> >
> > - while (src_addr < src_start + len) {
> > - pmd_t dst_pmdval;
> > + state.vma = dst_vma;
>
> Oh wait, the lock leak was introduced in patch 2.
Lock leak was introduced in patch 4 that moved getting the vma.
Patch 2 missed the assignment of state.len and introduced an issue with
bound checks.
> If there's an error between uffd_mfill_lock() and `state.vma = dst_vma`,
> it remains unlocked.
>
> Probably should have been fixed in 2, not patch 4...
> Sorry didn't realize it earlier.
>
> > - VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
> > + while (state.src_addr < src_start + len) {
> > + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> > +
> > + pmd_t dst_pmdval;
> >
> > - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> > + dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
> > if (unlikely(!dst_pmd)) {
> > err = -ENOMEM;
> > break;
> > @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >
> > out_unlock:
> > up_read(&ctx->map_changing_lock);
> > - uffd_mfill_unlock(dst_vma);
> > + uffd_mfill_unlock(state.vma);
> > out:
> > - if (folio)
> > - folio_put(folio);
> > + if (state.folio)
> > + folio_put(state.folio);
>
> Sashiko raised a concern [2] that it the VMA might be unmapped and
> a new mapping created as a uffd hugetlb vma and leak the folio by
> going through
>
> `if (is_vm_hugetlb_page(dst_vma))
> return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> src_start, len, flags);`
>
> but it appears to be a false positive (to me) because
>
> `if (atomic_read(&ctx->mmap_changing))` check should have detected unmapping
> and free the folio?
I think it's real, and it's there more or less from the beginning, although
nobody hit it yet :)
Before retrying the copy we drop all the locks, so if the copy is really
long the old mapping can be wiped and a new mapping can be created instead.
There's already a v4 of a patch that attempts to solve this:
https://lore.kernel.org/all/20260331134158.622084-1-devnexen@gmail.com
> [2] https://sashiko.dev/#/patchset/20260330101116.1117699-1-rppt%40kernel.org?patch=13671
>
> > VM_WARN_ON_ONCE(copied < 0);
> > VM_WARN_ON_ONCE(err > 0);
> > VM_WARN_ON_ONCE(!copied && !err);
>
> Otherwise looks correct to me.
>
> --
> Cheers,
> Harry / Hyeonggon
--
Sincerely yours,
Mike.
On Tue, Mar 31, 2026 at 05:32:28PM +0300, Mike Rapoport wrote:
> Hi Harry,
>
> On Tue, Mar 31, 2026 at 04:03:13PM +0900, Harry Yoo (Oracle) wrote:
> > On Mon, Mar 30, 2026 at 01:11:03PM +0300, Mike Rapoport wrote:
> > > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> > >
> > > mfill_atomic() passes a lot of parameters down to its callees.
> > >
> > > Aggregate them all into mfill_state structure and pass this structure to
> > > functions that implement various UFFDIO_ commands.
> > >
> > > Tracking the state in a structure will allow moving the code that retries
> > > copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
> > > loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
> > > memory.
> > >
> > > The mfill_state definition is deliberately local to mm/userfaultfd.c,
> > > hence shmem_mfill_atomic_pte() is not updated.
> > >
> > > [harry.yoo@oracle.com: properly initialize mfill_state.len to fix
> > > folio_add_new_anon_rmap() WARN]
> > > Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
> > > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > > Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> > > Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> > > ---
> > > mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
> > > 1 file changed, 82 insertions(+), 66 deletions(-)
> > >
> > > @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> > > goto out_unlock;
> > >
> > > - while (src_addr < src_start + len) {
> > > - pmd_t dst_pmdval;
> > > + state.vma = dst_vma;
> >
> > Oh wait, the lock leak was introduced in patch 2.
>
> Lock leak was introduced in patch 4 that moved getting the vma.
Still not sure what I could possibly be missing, so let me try again.
When I check out this commit, "userfaultfd: introduce struct mfill_state", I see:
| static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
| unsigned long dst_start,
| unsigned long src_start,
| unsigned long len,
| uffd_flags_t flags)
| {
| struct mfill_state state = (struct mfill_state){
| .ctx = ctx,
| .dst_start = dst_start,
| .src_start = src_start, .flags = flags,
| .len = len,
| .src_addr = src_start,
| .dst_addr = dst_start,
| };
|
[ ...snip...]
| retry:
| /*
| * Make sure the vma is not shared, that the dst range is
| * both valid and fully within a single existing vma.
| */
| dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
It acquires the vma lock (or mmap_lock) here, but doesn't set state.vma.
| if (IS_ERR(dst_vma)) {
| err = PTR_ERR(dst_vma);
| goto out;
| }
|
| /*
| * If memory mappings are changing because of non-cooperative
| * operation (e.g. mremap) running in parallel, bail out and
| * request the user to retry later
| */
| down_read(&ctx->map_changing_lock);
| err = -EAGAIN;
| if (atomic_read(&ctx->mmap_changing))
| goto out_unlock;
|
| err = -EINVAL;
| /*
| * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
| * it will overwrite vm_ops, so vma_is_anonymous must return false.
| */
| if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
| dst_vma->vm_flags & VM_SHARED))
|
| /*
| * validate 'mode' now that we know the dst_vma: don't allow
| * a wrprotect copy if the userfaultfd didn't register as WP.
| */
| if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
| goto out_unlock;
|
| /*
| * If this is a HUGETLB vma, pass off to appropriate routine
| */
| if (is_vm_hugetlb_page(dst_vma))
| return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
| src_start, len, flags);
|
| if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
| goto out_unlock;
| if (!vma_is_shmem(dst_vma) &&
| uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
| goto out_unlock;
|
| state.vma = dst_vma;
It is set here. So if anything before this jumps to the `out_unlock`
label due to a sanity check,
[...]
| while (state.src_addr < src_start + len) {
| VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
|
| pmd_t dst_pmdval;
| [...]
|
| out_unlock:
| up_read(&ctx->map_changing_lock);
| uffd_mfill_unlock(state.vma);
the `vma` parameter will be NULL?
If I'm not missing something, this is introduced in patch 2 and
fixed in patch 4.
| out:
| if (state.folio)
| folio_put(state.folio);
| VM_WARN_ON_ONCE(copied < 0);
| VM_WARN_ON_ONCE(err > 0);
| VM_WARN_ON_ONCE(!copied && !err);
| return copied ? copied : err;
| }
> > > @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >
> > > out_unlock:
> > > up_read(&ctx->map_changing_lock);
> > > - uffd_mfill_unlock(dst_vma);
> > > + uffd_mfill_unlock(state.vma);
> > > out:
> > > - if (folio)
> > > - folio_put(folio);
> > > + if (state.folio)
> > > + folio_put(state.folio);
> >
> > Sashiko raised a concern [2] that it the VMA might be unmapped and
> > a new mapping created as a uffd hugetlb vma and leak the folio by
> > going through
> >
> > `if (is_vm_hugetlb_page(dst_vma))
> > return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > src_start, len, flags);`
> >
> > but it appears to be a false positive (to me) because
> >
> > `if (atomic_read(&ctx->mmap_changing))` check should have detected unmapping
> > and free the folio?
>
> I think it's real, and it's there more or less from the beginning, although
> nobody hit it yet :)
>
> Before retrying the copy we drop all the locks, so if the copy is really
> long the old mapping can be wiped and a new mapping can be created instead.
Oops, perhaps I should have imagined harder :)
> There's already a v4 of a patch that attempts to solve this:
>
> https://lore.kernel.org/all/20260331134158.622084-1-devnexen@gmail.com
Thanks for the pointer!
> > [2] https://sashiko.dev/#/patchset/20260330101116.1117699-1-rppt%40kernel.org?patch=13671
> >
> > > VM_WARN_ON_ONCE(copied < 0);
> > > VM_WARN_ON_ONCE(err > 0);
> > > VM_WARN_ON_ONCE(!copied && !err);
--
Cheers,
Harry / Hyeonggon
On Wed, Apr 01, 2026 at 12:24:01AM +0900, Harry Yoo (Oracle) wrote:
> On Tue, Mar 31, 2026 at 05:32:28PM +0300, Mike Rapoport wrote:
> | /*
> | * Make sure the vma is not shared, that the dst range is
> | * both valid and fully within a single existing vma.
> | */
> | dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
>
> It acquires the vma lock (or mmap_lock) here, but doesn't set state.vma.
>
> | if (IS_ERR(dst_vma)) {
> | err = PTR_ERR(dst_vma);
> | goto out;
> | }
...
> | if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> | goto out_unlock;
> | if (!vma_is_shmem(dst_vma) &&
> | uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> | goto out_unlock;
> |
> | state.vma = dst_vma;
>
> It is set here. So if anything before this jumps to `out_unlock`
> label due to a sanity check,
>
> [...]
>
> | while (state.src_addr < src_start + len) {
> | VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> |
> | pmd_t dst_pmdval;
> | [...]
> |
> | out_unlock:
> | up_read(&ctx->map_changing_lock);
> | uffd_mfill_unlock(state.vma);
>
> the `vma` parameter will be NULL?
>
> If I'm not missing something this is introduced in patch 2 and
> fixed in patch 4.
You are right.
Here's a fixup (it causes a conflict in patch 4 though).
Andrew, I can send v4 if you prefer.
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fa9622ec7279..c4074b6f4aca 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -764,6 +764,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
err = PTR_ERR(dst_vma);
goto out;
}
+ state.vma = dst_vma;
/*
* If memory mappings are changing because of non-cooperative
@@ -804,8 +805,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- state.vma = dst_vma;
-
while (state.src_addr < src_start + len) {
VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
--
Sincerely yours,
Mike.
On Wed, 1 Apr 2026 10:36:03 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> Here's a fixup (it causes a conflict in patch 4 though).
> Andrew, I can send v4 if you prefer.
Looks pretty simple. "userfaultfd: introduce mfill_get_vma() and
mfill_put_vma()" now leaves things like this:
retry:
err = mfill_get_vma(&state);
if (err)
goto out;
state.vma = dst_vma;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(state.vma))
return mfill_atomic_hugetlb(ctx, state.vma, dst_start,
src_start, len, flags);
while (state.src_addr < src_start + len) {
VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
err = mfill_establish_pmd(&state);
if (err)
break;
/*
* For shmem mappings, khugepaged is allowed to remove page
* tables under us; pte_offset_map_lock() will deal with that.
*/
err = mfill_atomic_pte(&state);
cond_resched();
if (unlikely(err == -ENOENT)) {
void *kaddr;
mfill_put_vma(&state);
VM_WARN_ON_ONCE(!state.folio);
kaddr = kmap_local_folio(state.folio, 0);
err = copy_from_user(kaddr,
(const void __user *)state.src_addr,
PAGE_SIZE);
kunmap_local(kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
flush_dcache_folio(state.folio);
goto retry;
} else
VM_WARN_ON_ONCE(state.folio);
if (!err) {
state.dst_addr += PAGE_SIZE;
state.src_addr += PAGE_SIZE;
copied += PAGE_SIZE;
if (fatal_signal_pending(current))
err = -EINTR;
}
if (err)
break;
}
On Wed, 1 Apr 2026 10:37:17 -0700 Andrew Morton <akpm@linux-foundation.org> wrote: > On Wed, 1 Apr 2026 10:36:03 +0300 Mike Rapoport <rppt@kernel.org> wrote: > > > Here's a fixup (it causes a conflict in patch 4 though). > > Andrew, I can send v4 if you prefer. > > Looks pretty simple. "userfaultfd: introduce mfill_get_vma() and > mfill_put_vma()" now leaves things like this: err, actually, it gets messy. Can we please have a v4? I think there are a few other things pending? I've noted https://lkml.kernel.org/r/acqrtd7E6UxrX6Ji@casper.infradead.org and https://lkml.kernel.org/r/202603311743.uZYPu1Gn-lkp@intel.com Alternatively, I can leave the current series in mm-unstable as-is and we can add a fixup at end-of-series. Not ideal, but it's OK.
On Wed, Apr 01, 2026 at 10:44:49AM -0700, Andrew Morton wrote: > On Wed, 1 Apr 2026 10:37:17 -0700 Andrew Morton <akpm@linux-foundation.org> wrote: > > > On Wed, 1 Apr 2026 10:36:03 +0300 Mike Rapoport <rppt@kernel.org> wrote: > > > > > Here's a fixup (it causes a conflict in patch 4 though). > > > Andrew, I can send v4 if you prefer. > > > > Looks pretty simple. "userfaultfd: introduce mfill_get_vma() and > > mfill_put_vma()" now leaves things like this: > > err, actually, it gets messy. > > Can we please have a v4? I think there are a few other things pending? Sent. > I've noted > https://lkml.kernel.org/r/202603311743.uZYPu1Gn-lkp@intel.com Alice fixed this on the Rust side and the fix is queued to char-misc-linus: https://lore.kernel.org/all/CAH5fLgg63SYHQ34yqy7o2cA5f2Gh0aNTwTeQkzbWb+kV4Rr3Qw@mail.gmail.com > https://lkml.kernel.org/r/acqrtd7E6UxrX6Ji@casper.infradead.org We've been through this since Peter's first posting and I think there's a consensus about necessity of vm_uffd_ops and the argument was about what exact API they should have. -- Sincerely yours, Mike.
© 2016 - 2026 Red Hat, Inc.