From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
mfill_atomic() passes a lot of parameters down to its callees.
Aggregate them all into mfill_state structure and pass this structure to
functions that implement various UFFDIO_ commands.
Tracking the state in a structure will allow moving the code that retries
copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
memory.
The mfill_state definition is deliberately local to mm/userfaultfd.c,
hence shmem_mfill_atomic_pte() is not updated.
[harry.yoo@oracle.com: properly initialize mfill_state.len to fix
folio_add_new_anon_rmap() WARN]
Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
Acked-by: David Hildenbrand (Arm) <david@kernel.org>
---
mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
1 file changed, 82 insertions(+), 66 deletions(-)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 32637d557c95..fa9622ec7279 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -20,6 +20,20 @@
#include "internal.h"
#include "swap.h"
+struct mfill_state {
+ struct userfaultfd_ctx *ctx;
+ unsigned long src_start;
+ unsigned long dst_start;
+ unsigned long len;
+ uffd_flags_t flags;
+
+ struct vm_area_struct *vma;
+ unsigned long src_addr;
+ unsigned long dst_addr;
+ struct folio *folio;
+ pmd_t *pmd;
+};
+
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
{
@@ -272,17 +286,17 @@ static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr)
return ret;
}
-static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static int mfill_atomic_pte_copy(struct mfill_state *state)
{
- int ret;
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
+ unsigned long src_addr = state->src_addr;
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
struct folio *folio;
+ int ret;
- if (!*foliop) {
+ if (!state->folio) {
ret = -ENOMEM;
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
dst_addr);
@@ -294,13 +308,13 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- *foliop = folio;
+ state->folio = folio;
/* don't free the page */
goto out;
}
} else {
- folio = *foliop;
- *foliop = NULL;
+ folio = state->folio;
+ state->folio = NULL;
}
/*
@@ -357,10 +371,11 @@ static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd,
return ret;
}
-static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(struct mfill_state *state)
{
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
int ret;
@@ -392,13 +407,14 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
}
/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_continue(struct mfill_state *state)
{
- struct inode *inode = file_inode(dst_vma->vm_file);
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long dst_addr = state->dst_addr;
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
struct folio *folio;
struct page *page;
int ret;
@@ -436,15 +452,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
}
/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
-static int mfill_atomic_pte_poison(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- uffd_flags_t flags)
+static int mfill_atomic_pte_poison(struct mfill_state *state)
{
- int ret;
+ struct vm_area_struct *dst_vma = state->vma;
struct mm_struct *dst_mm = dst_vma->vm_mm;
+ unsigned long dst_addr = state->dst_addr;
+ pmd_t *dst_pmd = state->pmd;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
+ int ret;
_dst_pte = make_pte_marker(PTE_MARKER_POISONED);
ret = -EAGAIN;
@@ -668,22 +684,20 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
-static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- uffd_flags_t flags,
- struct folio **foliop)
+static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state)
{
+ struct vm_area_struct *dst_vma = state->vma;
+ unsigned long src_addr = state->src_addr;
+ unsigned long dst_addr = state->dst_addr;
+ struct folio **foliop = &state->folio;
+ uffd_flags_t flags = state->flags;
+ pmd_t *dst_pmd = state->pmd;
ssize_t err;
- if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
- return mfill_atomic_pte_continue(dst_pmd, dst_vma,
- dst_addr, flags);
- } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
- return mfill_atomic_pte_poison(dst_pmd, dst_vma,
- dst_addr, flags);
- }
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+ return mfill_atomic_pte_continue(state);
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON))
+ return mfill_atomic_pte_poison(state);
/*
* The normal page fault path for a shmem will invoke the
@@ -697,12 +711,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY))
- err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
- dst_addr, src_addr,
- flags, foliop);
+ err = mfill_atomic_pte_copy(state);
else
- err = mfill_atomic_pte_zeropage(dst_pmd,
- dst_vma, dst_addr);
+ err = mfill_atomic_pte_zeropage(state);
} else {
err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
dst_addr, src_addr,
@@ -718,13 +729,20 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long len,
uffd_flags_t flags)
{
+ struct mfill_state state = (struct mfill_state){
+ .ctx = ctx,
+ .dst_start = dst_start,
+ .src_start = src_start,
+ .flags = flags,
+ .len = len,
+ .src_addr = src_start,
+ .dst_addr = dst_start,
+ };
struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
+ long copied = 0;
ssize_t err;
pmd_t *dst_pmd;
- unsigned long src_addr, dst_addr;
- long copied;
- struct folio *folio;
/*
* Sanitize the command parameters:
@@ -736,10 +754,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
VM_WARN_ON_ONCE(src_start + len <= src_start);
VM_WARN_ON_ONCE(dst_start + len <= dst_start);
- src_addr = src_start;
- dst_addr = dst_start;
- copied = 0;
- folio = NULL;
retry:
/*
* Make sure the vma is not shared, that the dst range is
@@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- while (src_addr < src_start + len) {
- pmd_t dst_pmdval;
+ state.vma = dst_vma;
- VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
+ while (state.src_addr < src_start + len) {
+ VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
+
+ pmd_t dst_pmdval;
- dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
+ dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
if (unlikely(!dst_pmd)) {
err = -ENOMEM;
break;
@@ -827,34 +843,34 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
* tables under us; pte_offset_map_lock() will deal with that.
*/
- err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
- src_addr, flags, &folio);
+ state.pmd = dst_pmd;
+ err = mfill_atomic_pte(&state);
cond_resched();
if (unlikely(err == -ENOENT)) {
void *kaddr;
up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
- VM_WARN_ON_ONCE(!folio);
+ uffd_mfill_unlock(state.vma);
+ VM_WARN_ON_ONCE(!state.folio);
- kaddr = kmap_local_folio(folio, 0);
+ kaddr = kmap_local_folio(state.folio, 0);
err = copy_from_user(kaddr,
- (const void __user *) src_addr,
+ (const void __user *)state.src_addr,
PAGE_SIZE);
kunmap_local(kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
- flush_dcache_folio(folio);
+ flush_dcache_folio(state.folio);
goto retry;
} else
- VM_WARN_ON_ONCE(folio);
+ VM_WARN_ON_ONCE(state.folio);
if (!err) {
- dst_addr += PAGE_SIZE;
- src_addr += PAGE_SIZE;
+ state.dst_addr += PAGE_SIZE;
+ state.src_addr += PAGE_SIZE;
copied += PAGE_SIZE;
if (fatal_signal_pending(current))
@@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
out_unlock:
up_read(&ctx->map_changing_lock);
- uffd_mfill_unlock(dst_vma);
+ uffd_mfill_unlock(state.vma);
out:
- if (folio)
- folio_put(folio);
+ if (state.folio)
+ folio_put(state.folio);
VM_WARN_ON_ONCE(copied < 0);
VM_WARN_ON_ONCE(err > 0);
VM_WARN_ON_ONCE(!copied && !err);
--
2.53.0
On Mon, Mar 30, 2026 at 01:11:03PM +0300, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> mfill_atomic() passes a lot of parameters down to its callees.
>
> Aggregate them all into mfill_state structure and pass this structure to
> functions that implement various UFFDIO_ commands.
>
> Tracking the state in a structure will allow moving the code that retries
> copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
> loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
> memory.
>
> The mfill_state definition is deliberately local to mm/userfaultfd.c,
> hence shmem_mfill_atomic_pte() is not updated.
>
> [harry.yoo@oracle.com: properly initialize mfill_state.len to fix
> folio_add_new_anon_rmap() WARN]
> Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> ---
> mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
> 1 file changed, 82 insertions(+), 66 deletions(-)
>
> @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> goto out_unlock;
>
> - while (src_addr < src_start + len) {
> - pmd_t dst_pmdval;
> + state.vma = dst_vma;
Oh wait, the lock leak was introduced in patch 2.
If there's an error between uffd_mfill_lock() and `state.vma = dst_vma`,
it remains unlocked.
Probably should have been fixed in 2, not patch 4...
Sorry didn't realize it earlier.
> - VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
> + while (state.src_addr < src_start + len) {
> + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> +
> + pmd_t dst_pmdval;
>
> - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> + dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
> if (unlikely(!dst_pmd)) {
> err = -ENOMEM;
> break;
> @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
>
> out_unlock:
> up_read(&ctx->map_changing_lock);
> - uffd_mfill_unlock(dst_vma);
> + uffd_mfill_unlock(state.vma);
> out:
> - if (folio)
> - folio_put(folio);
> + if (state.folio)
> + folio_put(state.folio);
Sashiko raised a concern [2] that the VMA might be unmapped and
a new mapping created as a uffd hugetlb vma, leaking the folio by
going through
`if (is_vm_hugetlb_page(dst_vma))
return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
src_start, len, flags);`
but it appears to be a false positive (to me) because
`if (atomic_read(&ctx->mmap_changing))` check should have detected the unmapping
and freed the folio?
[2] https://sashiko.dev/#/patchset/20260330101116.1117699-1-rppt%40kernel.org?patch=13671
> VM_WARN_ON_ONCE(copied < 0);
> VM_WARN_ON_ONCE(err > 0);
> VM_WARN_ON_ONCE(!copied && !err);
Otherwise looks correct to me.
--
Cheers,
Harry / Hyeonggon
Hi Harry,
On Tue, Mar 31, 2026 at 04:03:13PM +0900, Harry Yoo (Oracle) wrote:
> On Mon, Mar 30, 2026 at 01:11:03PM +0300, Mike Rapoport wrote:
> > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> >
> > mfill_atomic() passes a lot of parameters down to its callees.
> >
> > Aggregate them all into mfill_state structure and pass this structure to
> > functions that implement various UFFDIO_ commands.
> >
> > Tracking the state in a structure will allow moving the code that retries
> > copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
> > loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
> > memory.
> >
> > The mfill_state definition is deliberately local to mm/userfaultfd.c,
> > hence shmem_mfill_atomic_pte() is not updated.
> >
> > [harry.yoo@oracle.com: properly initialize mfill_state.len to fix
> > folio_add_new_anon_rmap() WARN]
> > Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> > Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> > ---
> > mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
> > 1 file changed, 82 insertions(+), 66 deletions(-)
> >
> > @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> > goto out_unlock;
> >
> > - while (src_addr < src_start + len) {
> > - pmd_t dst_pmdval;
> > + state.vma = dst_vma;
>
> Oh wait, the lock leak was introduced in patch 2.
Lock leak was introduced in patch 4 that moved getting the vma.
Patch 2 missed the assignment of state.len and introduced an issue with
bound checks.
> If there's an error between uffd_mfill_lock() and `state.vma = dst_vma`,
> it remains unlocked.
>
> Probably should have been fixed in 2, not patch 4...
> Sorry didn't realize it earlier.
>
> > - VM_WARN_ON_ONCE(dst_addr >= dst_start + len);
> > + while (state.src_addr < src_start + len) {
> > + VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> > +
> > + pmd_t dst_pmdval;
> >
> > - dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> > + dst_pmd = mm_alloc_pmd(dst_mm, state.dst_addr);
> > if (unlikely(!dst_pmd)) {
> > err = -ENOMEM;
> > break;
> > @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> >
> > out_unlock:
> > up_read(&ctx->map_changing_lock);
> > - uffd_mfill_unlock(dst_vma);
> > + uffd_mfill_unlock(state.vma);
> > out:
> > - if (folio)
> > - folio_put(folio);
> > + if (state.folio)
> > + folio_put(state.folio);
>
> Sashiko raised a concern [2] that it the VMA might be unmapped and
> a new mapping created as a uffd hugetlb vma and leak the folio by
> going through
>
> `if (is_vm_hugetlb_page(dst_vma))
> return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> src_start, len, flags);`
>
> but it appears to be a false positive (to me) because
>
> `if (atomic_read(&ctx->mmap_changing))` check should have detected unmapping
> and free the folio?
I think it's real, and it's there more or less from the beginning, although
nobody hit it yet :)
Before retrying the copy we drop all the locks, so if the copy is really
long the old mapping can be wiped and a new mapping can be created instead.
There's already a v4 of a patch that attempts to solve this:
https://lore.kernel.org/all/20260331134158.622084-1-devnexen@gmail.com
> [2] https://sashiko.dev/#/patchset/20260330101116.1117699-1-rppt%40kernel.org?patch=13671
>
> > VM_WARN_ON_ONCE(copied < 0);
> > VM_WARN_ON_ONCE(err > 0);
> > VM_WARN_ON_ONCE(!copied && !err);
>
> Otherwise looks correct to me.
>
> --
> Cheers,
> Harry / Hyeonggon
--
Sincerely yours,
Mike.
On Tue, Mar 31, 2026 at 05:32:28PM +0300, Mike Rapoport wrote:
> Hi Harry,
>
> On Tue, Mar 31, 2026 at 04:03:13PM +0900, Harry Yoo (Oracle) wrote:
> > On Mon, Mar 30, 2026 at 01:11:03PM +0300, Mike Rapoport wrote:
> > > From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
> > >
> > > mfill_atomic() passes a lot of parameters down to its callees.
> > >
> > > Aggregate them all into mfill_state structure and pass this structure to
> > > functions that implement various UFFDIO_ commands.
> > >
> > > Tracking the state in a structure will allow moving the code that retries
> > > copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the
> > > loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped
> > > memory.
> > >
> > > The mfill_state definition is deliberately local to mm/userfaultfd.c,
> > > hence shmem_mfill_atomic_pte() is not updated.
> > >
> > > [harry.yoo@oracle.com: properly initialize mfill_state.len to fix
> > > folio_add_new_anon_rmap() WARN]
> > > Link: https://lkml.kernel.org/r/abehBY7QakYF9bK4@hyeyoo
> > > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > > Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
> > > Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> > > ---
> > > mm/userfaultfd.c | 148 ++++++++++++++++++++++++++---------------------
> > > 1 file changed, 82 insertions(+), 66 deletions(-)
> > >
> > > @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > > uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> > > goto out_unlock;
> > >
> > > - while (src_addr < src_start + len) {
> > > - pmd_t dst_pmdval;
> > > + state.vma = dst_vma;
> >
> > Oh wait, the lock leak was introduced in patch 2.
>
> Lock leak was introduced in patch 4 that moved getting the vma.
Still not sure what I could possibly be missing, so let me try again.
When I check out this commit, "userfaultfd: introduce struct mfill_state", I see:
| static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
| unsigned long dst_start,
| unsigned long src_start,
| unsigned long len,
| uffd_flags_t flags)
| {
| struct mfill_state state = (struct mfill_state){
| .ctx = ctx,
| .dst_start = dst_start,
| .src_start = src_start, .flags = flags,
| .len = len,
| .src_addr = src_start,
| .dst_addr = dst_start,
| };
|
[ ...snip...]
| retry:
| /*
| * Make sure the vma is not shared, that the dst range is
| * both valid and fully within a single existing vma.
| */
| dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
It acquires the vma lock (or mmap_lock) here, but doesn't set state.vma.
| if (IS_ERR(dst_vma)) {
| err = PTR_ERR(dst_vma);
| goto out;
| }
|
| /*
| * If memory mappings are changing because of non-cooperative
| * operation (e.g. mremap) running in parallel, bail out and
| * request the user to retry later
| */
| down_read(&ctx->map_changing_lock);
| err = -EAGAIN;
| if (atomic_read(&ctx->mmap_changing))
| goto out_unlock;
|
| err = -EINVAL;
| /*
| * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
| * it will overwrite vm_ops, so vma_is_anonymous must return false.
| */
| if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
| dst_vma->vm_flags & VM_SHARED))
|
| /*
| * validate 'mode' now that we know the dst_vma: don't allow
| * a wrprotect copy if the userfaultfd didn't register as WP.
| */
| if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
| goto out_unlock;
|
| /*
| * If this is a HUGETLB vma, pass off to appropriate routine
| */
| if (is_vm_hugetlb_page(dst_vma))
| return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
| src_start, len, flags);
|
| if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
| goto out_unlock;
| if (!vma_is_shmem(dst_vma) &&
| uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
| goto out_unlock;
|
| state.vma = dst_vma;
It is set here. So if anything before this jumps to the `out_unlock`
label due to a sanity check,
[...]
| while (state.src_addr < src_start + len) {
| VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
|
| pmd_t dst_pmdval;
| [...]
|
| out_unlock:
| up_read(&ctx->map_changing_lock);
| uffd_mfill_unlock(state.vma);
the `vma` parameter will be NULL?
If I'm not missing something, this is introduced in patch 2 and
fixed in patch 4.
| out:
| if (state.folio)
| folio_put(state.folio);
| VM_WARN_ON_ONCE(copied < 0);
| VM_WARN_ON_ONCE(err > 0);
| VM_WARN_ON_ONCE(!copied && !err);
| return copied ? copied : err;
| }
> > > @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
> > >
> > > out_unlock:
> > > up_read(&ctx->map_changing_lock);
> > > - uffd_mfill_unlock(dst_vma);
> > > + uffd_mfill_unlock(state.vma);
> > > out:
> > > - if (folio)
> > > - folio_put(folio);
> > > + if (state.folio)
> > > + folio_put(state.folio);
> >
> > Sashiko raised a concern [2] that it the VMA might be unmapped and
> > a new mapping created as a uffd hugetlb vma and leak the folio by
> > going through
> >
> > `if (is_vm_hugetlb_page(dst_vma))
> > return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
> > src_start, len, flags);`
> >
> > but it appears to be a false positive (to me) because
> >
> > `if (atomic_read(&ctx->mmap_changing))` check should have detected unmapping
> > and free the folio?
>
> I think it's real, and it's there more or less from the beginning, although
> nobody hit it yet :)
>
> Before retrying the copy we drop all the locks, so if the copy is really
> long the old mapping can be wiped and a new mapping can be created instead.
Oops, perhaps I should have imagined harder :)
> There's already a v4 of a patch that attempts to solve this:
>
> https://lore.kernel.org/all/20260331134158.622084-1-devnexen@gmail.com
Thanks for the pointer!
> > [2] https://sashiko.dev/#/patchset/20260330101116.1117699-1-rppt%40kernel.org?patch=13671
> >
> > > VM_WARN_ON_ONCE(copied < 0);
> > > VM_WARN_ON_ONCE(err > 0);
> > > VM_WARN_ON_ONCE(!copied && !err);
--
Cheers,
Harry / Hyeonggon
On Wed, Apr 01, 2026 at 12:24:01AM +0900, Harry Yoo (Oracle) wrote:
> On Tue, Mar 31, 2026 at 05:32:28PM +0300, Mike Rapoport wrote:
> | /*
> | * Make sure the vma is not shared, that the dst range is
> | * both valid and fully within a single existing vma.
> | */
> | dst_vma = uffd_mfill_lock(dst_mm, dst_start, len);
>
> It acquires the vma lock (or mmap_lock) here, but doesn't set state.vma.
>
> | if (IS_ERR(dst_vma)) {
> | err = PTR_ERR(dst_vma);
> | goto out;
> | }
...
> | if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> | goto out_unlock;
> | if (!vma_is_shmem(dst_vma) &&
> | uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
> | goto out_unlock;
> |
> | state.vma = dst_vma;
>
> It is set here. So if anything before this jumps to `out_unlock`
> label due to a sanity check,
>
> [...]
>
> | while (state.src_addr < src_start + len) {
> | VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
> |
> | pmd_t dst_pmdval;
> | [...]
> |
> | out_unlock:
> | up_read(&ctx->map_changing_lock);
> | uffd_mfill_unlock(state.vma);
>
> the `vma` parameter will be NULL?
>
> If I'm not missing something this is introduced in patch 2 and
> fixed in patch 4.
You are right.
Here's a fixup (it causes a conflict in patch 4 though).
Andrew, I can send v4 if you prefer.
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fa9622ec7279..c4074b6f4aca 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -764,6 +764,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
err = PTR_ERR(dst_vma);
goto out;
}
+ state.vma = dst_vma;
/*
* If memory mappings are changing because of non-cooperative
@@ -804,8 +805,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
- state.vma = dst_vma;
-
while (state.src_addr < src_start + len) {
VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
--
Sincerely yours,
Mike.
On Wed, 1 Apr 2026 10:36:03 +0300 Mike Rapoport <rppt@kernel.org> wrote:
> Here's a fixup (it causes a conflict in patch 4 though).
> Andrew, I can send v4 if you prefer.
Looks pretty simple. "userfaultfd: introduce mfill_get_vma() and
mfill_put_vma()" now leaves things like this:
retry:
err = mfill_get_vma(&state);
if (err)
goto out;
state.vma = dst_vma;
/*
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(state.vma))
return mfill_atomic_hugetlb(ctx, state.vma, dst_start,
src_start, len, flags);
while (state.src_addr < src_start + len) {
VM_WARN_ON_ONCE(state.dst_addr >= dst_start + len);
err = mfill_establish_pmd(&state);
if (err)
break;
/*
* For shmem mappings, khugepaged is allowed to remove page
* tables under us; pte_offset_map_lock() will deal with that.
*/
err = mfill_atomic_pte(&state);
cond_resched();
if (unlikely(err == -ENOENT)) {
void *kaddr;
mfill_put_vma(&state);
VM_WARN_ON_ONCE(!state.folio);
kaddr = kmap_local_folio(state.folio, 0);
err = copy_from_user(kaddr,
(const void __user *)state.src_addr,
PAGE_SIZE);
kunmap_local(kaddr);
if (unlikely(err)) {
err = -EFAULT;
goto out;
}
flush_dcache_folio(state.folio);
goto retry;
} else
VM_WARN_ON_ONCE(state.folio);
if (!err) {
state.dst_addr += PAGE_SIZE;
state.src_addr += PAGE_SIZE;
copied += PAGE_SIZE;
if (fatal_signal_pending(current))
err = -EINTR;
}
if (err)
break;
}
On Wed, 1 Apr 2026 10:37:17 -0700 Andrew Morton <akpm@linux-foundation.org> wrote: > On Wed, 1 Apr 2026 10:36:03 +0300 Mike Rapoport <rppt@kernel.org> wrote: > > > Here's a fixup (it causes a conflict in patch 4 though). > > Andrew, I can send v4 if you prefer. > > Looks pretty simple. "userfaultfd: introduce mfill_get_vma() and > mfill_put_vma()" now leaves things like this: err, actually, it gets messy. Can we please have a v4? I think there are a few other things pending? I've noted https://lkml.kernel.org/r/acqrtd7E6UxrX6Ji@casper.infradead.org and https://lkml.kernel.org/r/202603311743.uZYPu1Gn-lkp@intel.com Alternatively, I can leave the current series in mm-unstable as-is and we can add a fixup at end-of-series. Not ideal, but it's OK.
On Wed, Apr 01, 2026 at 10:44:49AM -0700, Andrew Morton wrote: > On Wed, 1 Apr 2026 10:37:17 -0700 Andrew Morton <akpm@linux-foundation.org> wrote: > > > On Wed, 1 Apr 2026 10:36:03 +0300 Mike Rapoport <rppt@kernel.org> wrote: > > > > > Here's a fixup (it causes a conflict in patch 4 though). > > > Andrew, I can send v4 if you prefer. > > > > Looks pretty simple. "userfaultfd: introduce mfill_get_vma() and > > mfill_put_vma()" now leaves things like this: > > err, actually, it gets messy. > > Can we please have a v4? I think there are a few other things pending? Sent. > I've noted > https://lkml.kernel.org/r/202603311743.uZYPu1Gn-lkp@intel.com Alice fixed this on the Rust side and the fix is queued to char-misc-linus: https://lore.kernel.org/all/CAH5fLgg63SYHQ34yqy7o2cA5f2Gh0aNTwTeQkzbWb+kV4Rr3Qw@mail.gmail.com > https://lkml.kernel.org/r/acqrtd7E6UxrX6Ji@casper.infradead.org We've been through this since Peter's first posting and I think there's a consensus about necessity of vm_uffd_ops and the argument was about what exact API they should have. -- Sincerely yours, Mike.
© 2016 - 2026 Red Hat, Inc.