From: Mike Rapoport
To: linux-mm@kvack.org
Cc: Andrea Arcangeli, Andrew Morton, Axel Rasmussen, Baolin Wang, David Hildenbrand, Hugh Dickins, James Houghton, "Liam R. Howlett", Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Muchun Song, Nikita Kalyazin, Oscar Salvador, Paolo Bonzini, Peter Xu, Sean Christopherson, Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH RFC 01/17] userfaultfd: introduce mfill_copy_folio_locked() helper
Date: Tue, 27 Jan 2026 21:29:20 +0200
Message-ID: <20260127192936.1250096-2-rppt@kernel.org>
In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org>
References: <20260127192936.1250096-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

Split the copying of data that is performed while locks are held out of
mfill_atomic_pte_copy() and into a new helper, mfill_copy_folio_locked().
This improves code readability and makes the rather involved
mfill_atomic_pte_copy() easier to follow.

No functional change.
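For illustration only (a sketch, not part of the patch itself): the helper
returns -EFAULT when the copy would have to fault, because it runs with page
faults disabled while the mmap_lock is held; the caller then stashes the folio
and reports -ENOENT so the copy can be retried with the locks dropped, as the
hunks below show.

	/* caller-side sketch, mirroring the diff below */
	ret = mfill_copy_folio_locked(folio, src_addr);
	if (unlikely(ret)) {
		ret = -ENOENT;		/* ask mfill_atomic() to retry unlocked */
		*foliop = folio;	/* keep the folio for the retry */
		goto out;
	}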
Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Peter Xu --- mm/userfaultfd.c | 59 ++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e6dfd5f28acd..a0885d543f22 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -238,6 +238,40 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, return ret; } =20 +static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_= addr) +{ + void *kaddr; + int ret; + + kaddr =3D kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); + ret =3D copy_from_user(kaddr, (const void __user *) src_addr, + PAGE_SIZE); + pagefault_enable(); + kunmap_local(kaddr); + + if (ret) + return -EFAULT; + + flush_dcache_folio(folio); + return ret; +} + static int mfill_atomic_pte_copy(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, @@ -245,7 +279,6 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, uffd_flags_t flags, struct folio **foliop) { - void *kaddr; int ret; struct folio *folio; =20 @@ -256,27 +289,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, if (!folio) goto out; =20 - kaddr =3D kmap_local_folio(folio, 0); - /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. 
- */ - pagefault_disable(); - ret =3D copy_from_user(kaddr, (const void __user *) src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(kaddr); + ret =3D mfill_copy_folio_locked(folio, src_addr); =20 /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -285,8 +298,6 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, /* don't free the page */ goto out; } - - flush_dcache_folio(folio); } else { folio =3D *foliop; *foliop =3D NULL; --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4AA3D326949; Tue, 27 Jan 2026 19:29:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542200; cv=none; b=ludgT81kIhG8HnrgpLG1FGQiqHgSB9bSglIYLWsow0PoT/rY+LZ/THv0P46GrPMXvz5wC1bvZh8iNbs9qI1V4Qo6IpBIC8FlZF+BjrZUWkmBSmbc+Bs6jXElxkzeA2LYtdXRYOaiKmifWdzxftfyTH/nS3Q7yUZ+05GWbsbo7l8= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542200; c=relaxed/simple; bh=QA0/2G/K3ZBW3GhC7A4tfiRTq7LYOOc+8j2kW+8FnYs=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=f4F2y7+OoPduVlQaEb+Ya1lPU+MA5M2EpGXB/nQ1vVcqsQn+6C/ayBEIiGpLajtWPgWxio33ufW9yrzgg4hE+FkCVLVgrYeFz/V5zbwPhYIO2APN5R7i5tgSPTH2p4INaIttRmwNIw1IPC9ropuTvDvD7Ft60RkZ3IXnV51qIH4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ooGptc46; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ooGptc46" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2044FC116C6; Tue, 27 Jan 2026 19:29:53 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542199; bh=QA0/2G/K3ZBW3GhC7A4tfiRTq7LYOOc+8j2kW+8FnYs=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ooGptc46J+V4zaOBgMR5V5oW964P8jwsjhTyh26ptaR2Uzz5YCZvZDI+tiwrm+9vb I4FTzO1f/yKV7oi88yV2Ka4LLX0tpS86XWLDFcUeDQ4+5bydO1zCjl9kDLIUStScmu o/65K1e7p8pWAANkdpveXk8qxmqQwxNwpWKlFQ5c4jOqeCyRgcAJ9SwYHS7xaXUEmB X4mgaXSSxKTSU5rgo1wlPGZXwHzQ4FpE3meM3saJUpUNzdzewdmyKKhRpqUSa+vkVz pyHQ8USM1zM3U+R10dlg22Xfm60B5O7bJPS3snC9o9cjUxKiudhrjwgHT/FfrkR8n6 +8ZDzXtdfihtg== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. 
Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 02/17] userfaultfd: introduce struct mfill_state Date: Tue, 27 Jan 2026 21:29:21 +0200 Message-ID: <20260127192936.1250096-3-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" mfill_atomic() passes a lot of parameters down to its callees. Aggregate them all into mfill_state structure and pass this structure to functions that implement various UFFDIO_ commands. Tracking the state in a structure will allow moving the code that retries copying of data for UFFDIO_COPY into mfill_atomic_pte_copy() and make the loop in mfill_atomic() identical for all UFFDIO operations on PTE-mapped memory. The mfill_state definition is deliberately local to mm/userfaultfd.c, hence shmem_mfill_atomic_pte() is not updated. Signed-off-by: Mike Rapoport (Microsoft) --- mm/userfaultfd.c | 148 ++++++++++++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 66 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a0885d543f22..6a0697c93ff4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -20,6 +20,20 @@ #include "internal.h" #include "swap.h" =20 +struct mfill_state { + struct userfaultfd_ctx *ctx; + unsigned long src_start; + unsigned long dst_start; + unsigned long len; + uffd_flags_t flags; + + struct vm_area_struct *vma; + unsigned long src_addr; + unsigned long dst_addr; + struct folio *folio; + pmd_t *pmd; +}; + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_en= d) { @@ -272,17 +286,17 @@ static int mfill_copy_folio_locked(struct folio *foli= o, unsigned long src_addr) return ret; } =20 -static int mfill_atomic_pte_copy(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static int mfill_atomic_pte_copy(struct mfill_state *state) { - int ret; + struct vm_area_struct *dst_vma =3D state->vma; + unsigned long dst_addr =3D state->dst_addr; + unsigned long src_addr =3D state->src_addr; + uffd_flags_t flags =3D state->flags; + pmd_t *dst_pmd =3D state->pmd; struct folio *folio; + int ret; =20 - if (!*foliop) { + if (!state->folio) { ret =3D -ENOMEM; folio =3D vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, dst_addr); @@ -294,13 +308,13 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { ret =3D -ENOENT; - *foliop =3D folio; + state->folio =3D folio; /* don't free the page */ goto out; } } else { - folio =3D *foliop; - *foliop =3D NULL; + folio =3D state->folio; + state->folio =3D NULL; } =20 /* @@ -357,10 +371,11 @@ static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_p= md, return ret; } =20 -static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_zeropage(struct mfill_state *state) { + struct vm_area_struct *dst_vma =3D 
state->vma; + unsigned long dst_addr =3D state->dst_addr; + pmd_t *dst_pmd =3D state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; @@ -392,13 +407,14 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, } =20 /* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */ -static int mfill_atomic_pte_continue(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_continue(struct mfill_state *state) { - struct inode *inode =3D file_inode(dst_vma->vm_file); + struct vm_area_struct *dst_vma =3D state->vma; + unsigned long dst_addr =3D state->dst_addr; pgoff_t pgoff =3D linear_page_index(dst_vma, dst_addr); + struct inode *inode =3D file_inode(dst_vma->vm_file); + uffd_flags_t flags =3D state->flags; + pmd_t *dst_pmd =3D state->pmd; struct folio *folio; struct page *page; int ret; @@ -436,15 +452,15 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, } =20 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ -static int mfill_atomic_pte_poison(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - uffd_flags_t flags) +static int mfill_atomic_pte_poison(struct mfill_state *state) { - int ret; + struct vm_area_struct *dst_vma =3D state->vma; struct mm_struct *dst_mm =3D dst_vma->vm_mm; + unsigned long dst_addr =3D state->dst_addr; + pmd_t *dst_pmd =3D state->pmd; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; + int ret; =20 _dst_pte =3D make_pte_marker(PTE_MARKER_POISONED); ret =3D -EAGAIN; @@ -668,22 +684,20 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultf= d_ctx *ctx, uffd_flags_t flags); #endif /* CONFIG_HUGETLB_PAGE */ =20 -static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) +static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) { + struct vm_area_struct *dst_vma =3D state->vma; + unsigned long src_addr =3D state->src_addr; + unsigned long dst_addr =3D state->dst_addr; + struct folio **foliop =3D &state->folio; + uffd_flags_t flags =3D state->flags; + pmd_t *dst_pmd =3D state->pmd; ssize_t err; =20 - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { - return mfill_atomic_pte_continue(dst_pmd, dst_vma, - dst_addr, flags); - } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { - return mfill_atomic_pte_poison(dst_pmd, dst_vma, - dst_addr, flags); - } + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + return mfill_atomic_pte_continue(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) + return mfill_atomic_pte_poison(state); =20 /* * The normal page fault path for a shmem will invoke the @@ -697,12 +711,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t = *dst_pmd, */ if (!(dst_vma->vm_flags & VM_SHARED)) { if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) - err =3D mfill_atomic_pte_copy(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); + err =3D mfill_atomic_pte_copy(state); else - err =3D mfill_atomic_pte_zeropage(dst_pmd, - dst_vma, dst_addr); + err =3D mfill_atomic_pte_zeropage(state); } else { err =3D shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, @@ -718,13 +729,20 @@ static __always_inline ssize_t mfill_atomic(struct us= erfaultfd_ctx *ctx, unsigned long len, uffd_flags_t flags) { + struct mfill_state state =3D (struct mfill_state){ + .ctx =3D ctx, + .dst_start =3D dst_start, + .src_start =3D src_start, + .flags =3D flags, + + .src_addr =3D 
src_start, + .dst_addr =3D dst_start, + }; struct mm_struct *dst_mm =3D ctx->mm; struct vm_area_struct *dst_vma; + long copied =3D 0; ssize_t err; pmd_t *dst_pmd; - unsigned long src_addr, dst_addr; - long copied; - struct folio *folio; =20 /* * Sanitize the command parameters: @@ -736,10 +754,6 @@ static __always_inline ssize_t mfill_atomic(struct use= rfaultfd_ctx *ctx, VM_WARN_ON_ONCE(src_start + len <=3D src_start); VM_WARN_ON_ONCE(dst_start + len <=3D dst_start); =20 - src_addr =3D src_start; - dst_addr =3D dst_start; - copied =3D 0; - folio =3D NULL; retry: /* * Make sure the vma is not shared, that the dst range is @@ -790,12 +804,14 @@ static __always_inline ssize_t mfill_atomic(struct us= erfaultfd_ctx *ctx, uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) goto out_unlock; =20 - while (src_addr < src_start + len) { - pmd_t dst_pmdval; + state.vma =3D dst_vma; =20 - VM_WARN_ON_ONCE(dst_addr >=3D dst_start + len); + while (state.src_addr < src_start + len) { + VM_WARN_ON_ONCE(state.dst_addr >=3D dst_start + len); + + pmd_t dst_pmdval; =20 - dst_pmd =3D mm_alloc_pmd(dst_mm, dst_addr); + dst_pmd =3D mm_alloc_pmd(dst_mm, state.dst_addr); if (unlikely(!dst_pmd)) { err =3D -ENOMEM; break; @@ -827,34 +843,34 @@ static __always_inline ssize_t mfill_atomic(struct us= erfaultfd_ctx *ctx, * tables under us; pte_offset_map_lock() will deal with that. */ =20 - err =3D mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, - src_addr, flags, &folio); + state.pmd =3D dst_pmd; + err =3D mfill_atomic_pte(&state); cond_resched(); =20 if (unlikely(err =3D=3D -ENOENT)) { void *kaddr; =20 up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); - VM_WARN_ON_ONCE(!folio); + uffd_mfill_unlock(state.vma); + VM_WARN_ON_ONCE(!state.folio); =20 - kaddr =3D kmap_local_folio(folio, 0); + kaddr =3D kmap_local_folio(state.folio, 0); err =3D copy_from_user(kaddr, - (const void __user *) src_addr, + (const void __user *)state.src_addr, PAGE_SIZE); kunmap_local(kaddr); if (unlikely(err)) { err =3D -EFAULT; goto out; } - flush_dcache_folio(folio); + flush_dcache_folio(state.folio); goto retry; } else - VM_WARN_ON_ONCE(folio); + VM_WARN_ON_ONCE(state.folio); =20 if (!err) { - dst_addr +=3D PAGE_SIZE; - src_addr +=3D PAGE_SIZE; + state.dst_addr +=3D PAGE_SIZE; + state.src_addr +=3D PAGE_SIZE; copied +=3D PAGE_SIZE; =20 if (fatal_signal_pending(current)) @@ -866,10 +882,10 @@ static __always_inline ssize_t mfill_atomic(struct us= erfaultfd_ctx *ctx, =20 out_unlock: up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(dst_vma); + uffd_mfill_unlock(state.vma); out: - if (folio) - folio_put(folio); + if (state.folio) + folio_put(state.folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 535453570B3; Tue, 27 Jan 2026 19:30:06 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542206; cv=none; b=Q32vLozitTQKrbaJHYyQVnLUERcIxZlW8/MMe5EEDyRmzYqD8nuiavQTuOgJ3eWXR1QwyFaEqB342u/a+kyCQHrh5q91G4z5Ve61MKP9aMMsQOKXIkR/hpHVEsFtrAmXbMCnpAKPj++aXqmi8xKBFo4eKlinSURvlmPBhFX2LPE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542206; 
c=relaxed/simple; bh=vrJiGnjrN54H6/hY2yBJ167Kljp428ywa4wfYNGCg7c=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=u3ZgjhaPfR7ZdTSTd4MjbXj3VAivoHVOfk0pfgg36OTyX2vHmsN20R36Rs698rIsrynczR0L1Dnf59PMZnNRin93FSvK0jqrM5K2I6i02b/jci3DtNlkljF691euQGwEhaLPXGPg3Kqm0EeZCUrCm4/BLgGpio2N1DCCwtqZKIk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ZsAx+lcO; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZsAx+lcO" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 5E2B5C19422; Tue, 27 Jan 2026 19:30:00 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542206; bh=vrJiGnjrN54H6/hY2yBJ167Kljp428ywa4wfYNGCg7c=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ZsAx+lcOfUN+ECjYaQy+/o3roO+K1SedxxG5fNMiknciYUsLwxKjIjhmCgIUs3wNY Y81+OqvNTso0O/aKTQ9Y8I0Gkm/8wJDtbigcgGWHw4VLjiOiUwfKlBrZS9ZK4AKR4f w3tjGLjAXXqe52duuTP9qv/6lh1muhhNvJ2d3oyTZGqyyXYTbZfjQWj+ZVZkUQHjlC 36C4iJ6jhgGm9WiJMNqaoViZgCOZuXilC/3Z0t3ZF15rAoqN0uQxYBP5eEq2w16lmp 5aPfACXz90qz/XONl8LYfFtdB38cmBz0euQcpU84+fQHpFa7ZAeuo75YE4KQzrt1+k wN4w2q+MEFi8g== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 03/17] userfaultfd: introduce mfill_get_pmd() helper. Date: Tue, 27 Jan 2026 21:29:22 +0200 Message-ID: <20260127192936.1250096-4-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" There is a lengthy code chunk in mfill_atomic() that establishes the PMD for UFFDIO operations. This code may be called twice: first time when the copy is performed with VMA/mm locks held and the other time after the copy is retried with locks dropped. Move the code that establishes a PMD into a helper function so it can be reused later during refactoring of mfill_atomic_pte_copy(). Signed-off-by: Mike Rapoport (Microsoft) --- mm/userfaultfd.c | 103 ++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 6a0697c93ff4..9dd285b13f3b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,6 +157,57 @@ static void uffd_mfill_unlock(struct vm_area_struct *v= ma) } #endif =20 +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + + pgd =3D pgd_offset(mm, address); + p4d =3D p4d_alloc(mm, pgd, address); + if (!p4d) + return NULL; + pud =3D pud_alloc(mm, p4d, address); + if (!pud) + return NULL; + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. 
+ */ + return pmd_alloc(mm, pud, address); +} + +static int mfill_get_pmd(struct mfill_state *state) +{ + struct mm_struct *dst_mm =3D state->ctx->mm; + pmd_t *dst_pmd; + pmd_t dst_pmdval; + + dst_pmd =3D mm_alloc_pmd(dst_mm, state->dst_addr); + if (unlikely(!dst_pmd)) + return -ENOMEM; + + dst_pmdval =3D pmdp_get_lockless(dst_pmd); + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_pmd))) + return -ENOMEM; + + dst_pmdval =3D pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is THP don't override it and just be strict. + * (This includes the case where the PMD used to be THP and + * changed back to none after __pte_alloc().) + */ + if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval))) + return -EEXIST; + if (unlikely(pmd_bad(dst_pmdval))) + return -EFAULT; + + state->pmd =3D dst_pmd; + return 0; +} + /* Check if dst_addr is outside of file's size. Must be called with ptl he= ld. */ static bool mfill_file_over_size(struct vm_area_struct *dst_vma, unsigned long dst_addr) @@ -489,27 +540,6 @@ static int mfill_atomic_pte_poison(struct mfill_state = *state) return ret; } =20 -static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) -{ - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - - pgd =3D pgd_offset(mm, address); - p4d =3D p4d_alloc(mm, pgd, address); - if (!p4d) - return NULL; - pud =3D pud_alloc(mm, p4d, address); - if (!pud) - return NULL; - /* - * Note that we didn't run this because the pmd was - * missing, the *pmd may be already established and in - * turn it may also be a trans_huge_pmd. - */ - return pmd_alloc(mm, pud, address); -} - #ifdef CONFIG_HUGETLB_PAGE /* * mfill_atomic processing for HUGETLB vmas. Note that this routine is @@ -742,7 +772,6 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, struct vm_area_struct *dst_vma; long copied =3D 0; ssize_t err; - pmd_t *dst_pmd; =20 /* * Sanitize the command parameters: @@ -809,41 +838,15 @@ static __always_inline ssize_t mfill_atomic(struct us= erfaultfd_ctx *ctx, while (state.src_addr < src_start + len) { VM_WARN_ON_ONCE(state.dst_addr >=3D dst_start + len); =20 - pmd_t dst_pmdval; - - dst_pmd =3D mm_alloc_pmd(dst_mm, state.dst_addr); - if (unlikely(!dst_pmd)) { - err =3D -ENOMEM; + err =3D mfill_get_pmd(&state); + if (err) break; - } =20 - dst_pmdval =3D pmdp_get_lockless(dst_pmd); - if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_pmd))) { - err =3D -ENOMEM; - break; - } - dst_pmdval =3D pmdp_get_lockless(dst_pmd); - /* - * If the dst_pmd is THP don't override it and just be strict. - * (This includes the case where the PMD used to be THP and - * changed back to none after __pte_alloc().) - */ - if (unlikely(!pmd_present(dst_pmdval) || - pmd_trans_huge(dst_pmdval))) { - err =3D -EEXIST; - break; - } - if (unlikely(pmd_bad(dst_pmdval))) { - err =3D -EFAULT; - break; - } /* * For shmem mappings, khugepaged is allowed to remove page * tables under us; pte_offset_map_lock() will deal with that. 
 		 */
 
-		state.pmd = dst_pmd;
 		err = mfill_atomic_pte(&state);
 		cond_resched();
 
-- 
2.51.0

From: Mike Rapoport
To: linux-mm@kvack.org
Cc: Andrea Arcangeli, Andrew Morton, Axel Rasmussen, Baolin Wang, David Hildenbrand, Hugh Dickins, James Houghton, "Liam R. Howlett", Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Muchun Song, Nikita Kalyazin, Oscar Salvador, Paolo Bonzini, Peter Xu, Sean Christopherson, Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH RFC 04/17] userfaultfd: introduce mfill_get_vma() and mfill_put_vma()
Date: Tue, 27 Jan 2026 21:29:23 +0200
Message-ID: <20260127192936.1250096-5-rppt@kernel.org>
In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org>
References: <20260127192936.1250096-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

Split the code that finds, locks and verifies the destination VMA out of
mfill_atomic() and into a helper function. This helper will be used later
during the refactoring of mfill_atomic_pte_copy().

Add a counterpart mfill_put_vma() helper that unlocks the VMA and releases
the map_changing_lock.
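A minimal usage sketch (a hypothetical caller, not taken from the patch) of how
the two helpers are meant to pair up around a single UFFDIO operation; the
helpers mfill_get_pmd() and mfill_atomic_pte() used here are the ones this
series already introduced:

	/* sketch only: lock the destination VMA, do one operation, unlock */
	static ssize_t mfill_one_op_sketch(struct mfill_state *state)
	{
		ssize_t err = mfill_get_vma(state);	/* VMA + map_changing_lock */

		if (err)
			return err;	/* e.g. -EAGAIN while mappings are changing */

		err = mfill_get_pmd(state);		/* establish the PMD */
		if (!err)
			err = mfill_atomic_pte(state);	/* perform the UFFDIO request */

		mfill_put_vma(state);	/* unlock the VMA, drop map_changing_lock */
		return err;
	}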
Signed-off-by: Mike Rapoport (Microsoft) --- mm/userfaultfd.c | 124 ++++++++++++++++++++++++++++------------------- 1 file changed, 73 insertions(+), 51 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9dd285b13f3b..45d8f04aaf4f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,6 +157,73 @@ static void uffd_mfill_unlock(struct vm_area_struct *v= ma) } #endif =20 +static void mfill_put_vma(struct mfill_state *state) +{ + up_read(&state->ctx->map_changing_lock); + uffd_mfill_unlock(state->vma); + state->vma =3D NULL; +} + +static int mfill_get_vma(struct mfill_state *state) +{ + struct userfaultfd_ctx *ctx =3D state->ctx; + uffd_flags_t flags =3D state->flags; + struct vm_area_struct *dst_vma; + int err; + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + dst_vma =3D uffd_mfill_lock(ctx->mm, state->dst_start, state->len); + if (IS_ERR(dst_vma)) + return PTR_ERR(dst_vma); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + down_read(&ctx->map_changing_lock); + err =3D -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out_unlock; + + err =3D -EINVAL; + + /* + * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but + * it will overwrite vm_ops, so vma_is_anonymous must return false. + */ + if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && + dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + + /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + if (is_vm_hugetlb_page(dst_vma)) + goto out; + + if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) + goto out_unlock; + if (!vma_is_shmem(dst_vma) && + uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + goto out_unlock; + +out: + state->vma =3D dst_vma; + return 0; + +out_unlock: + mfill_put_vma(state); + return err; +} + static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; @@ -768,8 +835,6 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, .src_addr =3D src_start, .dst_addr =3D dst_start, }; - struct mm_struct *dst_mm =3D ctx->mm; - struct vm_area_struct *dst_vma; long copied =3D 0; ssize_t err; =20 @@ -784,57 +849,17 @@ static __always_inline ssize_t mfill_atomic(struct us= erfaultfd_ctx *ctx, VM_WARN_ON_ONCE(dst_start + len <=3D dst_start); =20 retry: - /* - * Make sure the vma is not shared, that the dst range is - * both valid and fully within a single existing vma. - */ - dst_vma =3D uffd_mfill_lock(dst_mm, dst_start, len); - if (IS_ERR(dst_vma)) { - err =3D PTR_ERR(dst_vma); + err =3D mfill_get_vma(&state); + if (err) goto out; - } - - /* - * If memory mappings are changing because of non-cooperative - * operation (e.g. mremap) running in parallel, bail out and - * request the user to retry later - */ - down_read(&ctx->map_changing_lock); - err =3D -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) - goto out_unlock; - - err =3D -EINVAL; - /* - * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but - * it will overwrite vm_ops, so vma_is_anonymous must return false. 
- */ - if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) && - dst_vma->vm_flags & VM_SHARED)) - goto out_unlock; - - /* - * validate 'mode' now that we know the dst_vma: don't allow - * a wrprotect copy if the userfaultfd didn't register as WP. - */ - if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP)) - goto out_unlock; =20 /* * If this is a HUGETLB vma, pass off to appropriate routine */ - if (is_vm_hugetlb_page(dst_vma)) - return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, + if (is_vm_hugetlb_page(state.vma)) + return mfill_atomic_hugetlb(ctx, state.vma, dst_start, src_start, len, flags); =20 - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) - goto out_unlock; - if (!vma_is_shmem(dst_vma) && - uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) - goto out_unlock; - - state.vma =3D dst_vma; - while (state.src_addr < src_start + len) { VM_WARN_ON_ONCE(state.dst_addr >=3D dst_start + len); =20 @@ -853,8 +878,7 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, if (unlikely(err =3D=3D -ENOENT)) { void *kaddr; =20 - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(state.vma); + mfill_put_vma(&state); VM_WARN_ON_ONCE(!state.folio); =20 kaddr =3D kmap_local_folio(state.folio, 0); @@ -883,9 +907,7 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, break; } =20 -out_unlock: - up_read(&ctx->map_changing_lock); - uffd_mfill_unlock(state.vma); + mfill_put_vma(&state); out: if (state.folio) folio_put(state.folio); --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1DA3136E469; Tue, 27 Jan 2026 19:30:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542219; cv=none; b=hsqo6/u1prmYWSchy5D2AwV06bMP07RYrzOOc3OSy+a5y5w/FkzVl0Fell413fFEVp8T8WsltUglQwxhhNk26ncUl994OlEjhecpjFgDLbGKCsPBtHvpJBVqesSSv3fTMUJNTX32jYHZVov9zE38vAz/BpVwQpfVbm/Gim1SSn4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542219; c=relaxed/simple; bh=0i1d2fpNNGr7J2FXKMkhDUOQSLx8UrCEb1MMN67S4zY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=B/1gPyhNg6nf9y5qOTbLbjIAF72WhUqcnO26UFgjxgF5uQzBBexRYFcvjMUtBSoWtupajV2KmqlYp++iXfjcrCrQd9AfxFVguZrq7CgwZ8ry+YP+5QtjW8VaU6mDZC7343o3IwR8TgyIHT1Ml9lUFBOdCf02M1l6AKxkwD4nweo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=LFjN+VuE; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="LFjN+VuE" Received: by smtp.kernel.org (Postfix) with ESMTPSA id EA1C0C116C6; Tue, 27 Jan 2026 19:30:12 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542218; bh=0i1d2fpNNGr7J2FXKMkhDUOQSLx8UrCEb1MMN67S4zY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=LFjN+VuEwQhJBIsfMbC+PmRxi9KIPVJN4uRtNpCoxNN9r9XQMLQHf0SUC/QcdqCdv yTbesD04KbeYLsqu9jsK5VCgiKMXWXlhDdWUti9o3laDuHpqKq0cyQpRIpTuVRDK3G 9s8YlgqtFbLBtQtxXOEyaDTUXUdJl17RWcVocISkGcSkSmsU5NvoiKCQikZgy87kFF O8R4uqwxHCNJdejR+L/GTxAWcoaC7L8L3E8vakXLO1fq4oq3POjF84JIKWuCS7CIvg 
From: Mike Rapoport
To: linux-mm@kvack.org
Cc: Andrea Arcangeli, Andrew Morton, Axel Rasmussen, Baolin Wang, David Hildenbrand, Hugh Dickins, James Houghton, "Liam R. Howlett", Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Muchun Song, Nikita Kalyazin, Oscar Salvador, Paolo Bonzini, Peter Xu, Sean Christopherson, Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH RFC 05/17] userfaultfd: retry copying with locks dropped in mfill_atomic_pte_copy()
Date: Tue, 27 Jan 2026 21:29:24 +0200
Message-ID: <20260127192936.1250096-6-rppt@kernel.org>
In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org>
References: <20260127192936.1250096-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

The implementation of UFFDIO_COPY for anonymous memory might fail to copy
data from the userspace buffer when the destination VMA is locked (either
with the mmap_lock or with the per-VMA lock). In that case, mfill_atomic()
releases the locks, retries copying the data with the locks dropped, and
then re-locks the destination VMA and re-establishes the PMD.

Since this retry-and-reget dance is only relevant for UFFDIO_COPY and never
happens for the other UFFDIO_ operations, make it a part of
mfill_atomic_pte_copy(), which actually implements UFFDIO_COPY for
anonymous memory.

The shmem implementation will be updated later and the loop in
mfill_atomic() will be adjusted afterwards.
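In simplified form, the retry flow that moves into mfill_atomic_pte_copy()
looks like the condensed fragment below (a sketch of the new
mfill_copy_folio_retry() from the diff that follows, with error handling
trimmed and local declarations omitted):

	/* sketch: retry the copy with the locks dropped, then relock */
	mfill_put_vma(state);				/* drop VMA/mm locks */

	kaddr = kmap_local_folio(folio, 0);
	err = copy_from_user(kaddr, (const void __user *)src_addr, PAGE_SIZE);
	kunmap_local(kaddr);				/* faulting is fine here */
	if (err)
		return -EFAULT;
	flush_dcache_folio(folio);

	err = mfill_get_vma(state);			/* VMA may have changed */
	if (!err)
		err = mfill_get_pmd(state);		/* so may the page table */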
Signed-off-by: Mike Rapoport (Microsoft) --- mm/userfaultfd.c | 70 +++++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 45d8f04aaf4f..01a2b898fa40 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -404,35 +404,57 @@ static int mfill_copy_folio_locked(struct folio *foli= o, unsigned long src_addr) return ret; } =20 +static int mfill_copy_folio_retry(struct mfill_state *state, struct folio = *folio) +{ + unsigned long src_addr =3D state->src_addr; + void *kaddr; + int err; + + /* retry copying with mm_lock dropped */ + mfill_put_vma(state); + + kaddr =3D kmap_local_folio(folio, 0); + err =3D copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); + kunmap_local(kaddr); + if (unlikely(err)) + return -EFAULT; + + flush_dcache_folio(folio); + + /* reget VMA and PMD, they could change underneath us */ + err =3D mfill_get_vma(state); + if (err) + return err; + + err =3D mfill_get_pmd(state); + if (err) + return err; + + return 0; +} + static int mfill_atomic_pte_copy(struct mfill_state *state) { - struct vm_area_struct *dst_vma =3D state->vma; unsigned long dst_addr =3D state->dst_addr; unsigned long src_addr =3D state->src_addr; uffd_flags_t flags =3D state->flags; - pmd_t *dst_pmd =3D state->pmd; struct folio *folio; int ret; =20 - if (!state->folio) { - ret =3D -ENOMEM; - folio =3D vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr); - if (!folio) - goto out; + folio =3D vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, state->vma, dst_addr); + if (!folio) + return -ENOMEM; =20 - ret =3D mfill_copy_folio_locked(folio, src_addr); + ret =3D -ENOMEM; + if (mem_cgroup_charge(folio, state->vma->vm_mm, GFP_KERNEL)) + goto out_release; =20 + ret =3D mfill_copy_folio_locked(folio, src_addr); + if (unlikely(ret)) { /* fallback to copy_from_user outside mmap_lock */ - if (unlikely(ret)) { - ret =3D -ENOENT; - state->folio =3D folio; - /* don't free the page */ - goto out; - } - } else { - folio =3D state->folio; - state->folio =3D NULL; + ret =3D mfill_copy_folio_retry(state, folio); + if (ret) + goto out_release; } =20 /* @@ -442,17 +464,16 @@ static int mfill_atomic_pte_copy(struct mfill_state *= state) */ __folio_mark_uptodate(folio); =20 - ret =3D -ENOMEM; - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_release; - - ret =3D mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, + ret =3D mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, &folio->page, true, flags); if (ret) goto out_release; out: return ret; out_release: + /* Don't return -ENOENT so that our caller won't retry */ + if (ret =3D=3D -ENOENT) + ret =3D -EFAULT; folio_put(folio); goto out; } @@ -907,7 +928,8 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, break; } =20 - mfill_put_vma(&state); + if (state.vma) + mfill_put_vma(&state); out: if (state.folio) folio_put(state.folio); --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7E15E36CDFD; Tue, 27 Jan 2026 19:30:25 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542225; cv=none; 
From: Mike Rapoport
To: linux-mm@kvack.org
Cc: Andrea Arcangeli, Andrew Morton, Axel Rasmussen, Baolin Wang, David Hildenbrand, Hugh Dickins, James Houghton, "Liam R. Howlett", Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Muchun Song, Nikita Kalyazin, Oscar Salvador, Paolo Bonzini, Peter Xu, Sean Christopherson, Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org, "David Hildenbrand (Red Hat)"
Subject: [PATCH RFC 06/17] userfaultfd: move vma_can_userfault out of line
Date: Tue, 27 Jan 2026 21:29:25 +0200
Message-ID: <20260127192936.1250096-7-rppt@kernel.org>
In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org>
References: <20260127192936.1250096-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

vma_can_userfault() has grown pretty big and it is not called on a
performance-critical path. Move it out of line.

No functional changes.

Reviewed-by: David Hildenbrand (Red Hat)
Reviewed-by: Liam R.
Howlett Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/userfaultfd_k.h | 35 ++--------------------------------- mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 33 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index fd5f42765497..a49cf750e803 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -208,39 +208,8 @@ static inline bool userfaultfd_armed(struct vm_area_st= ruct *vma) return vma->vm_flags & __VM_UFFD_FLAGS; } =20 -static inline bool vma_can_userfault(struct vm_area_struct *vma, - vm_flags_t vm_flags, - bool wp_async) -{ - vm_flags &=3D __VM_UFFD_FLAGS; - - if (vma->vm_flags & VM_DROPPABLE) - return false; - - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) - return false; - - /* - * If wp async enabled, and WP is the only mode enabled, allow any - * memory type. - */ - if (wp_async && (vm_flags =3D=3D VM_UFFD_WP)) - return true; - - /* - * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. - */ - if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && - !vma_is_anonymous(vma)) - return false; - - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); -} +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async); =20 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct = *vma) { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 01a2b898fa40..786f0a245675 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -2016,6 +2016,39 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsi= gned long dst_start, return moved ? moved : err; } =20 +bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, + bool wp_async) +{ + vm_flags &=3D __VM_UFFD_FLAGS; + + if (vma->vm_flags & VM_DROPPABLE) + return false; + + if ((vm_flags & VM_UFFD_MINOR) && + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + return false; + + /* + * If wp async enabled, and WP is the only mode enabled, allow any + * memory type. + */ + if (wp_async && (vm_flags =3D=3D VM_UFFD_WP)) + return true; + + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem & hugetlbfs are not supported but only + * anonymous. 
+	 */
+	if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) &&
+	    !vma_is_anonymous(vma))
+		return false;
+
+	/* By default, allow any of anon|shmem|hugetlb */
+	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+	       vma_is_shmem(vma);
+}
+
 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
 				     vm_flags_t vm_flags)
 {
-- 
2.51.0

From: Mike Rapoport
To: linux-mm@kvack.org
Cc: Andrea Arcangeli, Andrew Morton, Axel Rasmussen, Baolin Wang, David Hildenbrand, Hugh Dickins, James Houghton, "Liam R. Howlett", Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Muchun Song, Nikita Kalyazin, Oscar Salvador, Paolo Bonzini, Peter Xu, Sean Christopherson, Shuah Khan, Suren Baghdasaryan, Vlastimil Babka, linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org
Subject: [PATCH RFC 07/17] userfaultfd: introduce vm_uffd_ops
Date: Tue, 27 Jan 2026 21:29:26 +0200
Message-ID: <20260127192936.1250096-8-rppt@kernel.org>
In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org>
References: <20260127192936.1250096-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

The current userfaultfd implementation works only with memory managed by
the core MM: anonymous, shmem and hugetlb.
First, there is no fundamental reason to limit userfaultfd support only to the core memory types and userfaults can be handled similarly to regular page faults provided a VMA owner implements appropriate callbacks. Second, historically various code paths were conditioned on vma_is_anonymous(), vma_is_shmem() and is_vm_hugetlb_page() and some of these conditions can be expressed as operations implemented by a particular memory type. Introduce vm_uffd_ops extension to vm_operations_struct that will delegate memory type specific operations to a VMA owner. Operations for anonymous memory are handled internally in userfaultfd using anon_uffd_ops that implicitly assigned to anonymous VMAs. Start with a single operation, ->can_userfault() that will verify that a VMA meets requirements for userfaultfd support at registration time. Implement that method for anonymous, shmem and hugetlb and move relevant parts of vma_can_userfault() into the new callbacks. Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/mm.h | 5 +++++ include/linux/userfaultfd_k.h | 6 +++++ mm/hugetlb.c | 21 ++++++++++++++++++ mm/shmem.c | 23 ++++++++++++++++++++ mm/userfaultfd.c | 41 ++++++++++++++++++++++------------- 5 files changed, 81 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 15076261d0c2..3c2caff646c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -732,6 +732,8 @@ struct vm_fault { */ }; =20 +struct vm_uffd_ops; + /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -817,6 +819,9 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + const struct vm_uffd_ops *uffd_ops; +#endif }; =20 #ifdef CONFIG_NUMA_BALANCING diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index a49cf750e803..56e85ab166c7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -80,6 +80,12 @@ struct userfaultfd_ctx { =20 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long rea= son); =20 +/* VMA userfaultfd operations */ +struct vm_uffd_ops { + /* Checks if a VMA can support userfaultfd */ + bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); +}; + /* A combined operation mode + behavior flags. */ typedef unsigned int __bitwise uffd_flags_t; =20 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51273baec9e5..909131910c43 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4797,6 +4797,24 @@ static vm_fault_t hugetlb_vm_op_fault(struct vm_faul= t *vmf) return 0; } =20 +#ifdef CONFIG_USERFAULTFD +static bool hugetlb_can_userfault(struct vm_area_struct *vma, + vm_flags_t vm_flags) +{ + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then hugetlb is not supported. + */ + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP)) + return false; + return true; +} + +static const struct vm_uffd_ops hugetlb_uffd_ops =3D { + .can_userfault =3D hugetlb_can_userfault, +}; +#endif + /* * When a new function is introduced to vm_operations_struct and added * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops. 
@@ -4810,6 +4828,9 @@ const struct vm_operations_struct hugetlb_vm_ops =3D { .close =3D hugetlb_vm_op_close, .may_split =3D hugetlb_vm_op_split, .pagesize =3D hugetlb_vm_op_pagesize, +#ifdef CONFIG_USERFAULTFD + .uffd_ops =3D &hugetlb_uffd_ops, +#endif }; =20 static pte_t make_huge_pte(struct vm_area_struct *vma, struct folio *folio, diff --git a/mm/shmem.c b/mm/shmem.c index ec6c01378e9d..9b82cda271c4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5290,6 +5290,23 @@ static const struct super_operations shmem_ops =3D { #endif }; =20 +#ifdef CONFIG_USERFAULTFD +static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_= flags) +{ + /* + * If user requested uffd-wp but not enabled pte markers for + * uffd-wp, then shmem is not supported. + */ + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP)) + return false; + return true; +} + +static const struct vm_uffd_ops shmem_uffd_ops =3D { + .can_userfault =3D shmem_can_userfault, +}; +#endif + static const struct vm_operations_struct shmem_vm_ops =3D { .fault =3D shmem_fault, .map_pages =3D filemap_map_pages, @@ -5297,6 +5314,9 @@ static const struct vm_operations_struct shmem_vm_ops= =3D { .set_policy =3D shmem_set_policy, .get_policy =3D shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops =3D &shmem_uffd_ops, +#endif }; =20 static const struct vm_operations_struct shmem_anon_vm_ops =3D { @@ -5306,6 +5326,9 @@ static const struct vm_operations_struct shmem_anon_v= m_ops =3D { .set_policy =3D shmem_set_policy, .get_policy =3D shmem_get_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops =3D &shmem_uffd_ops, +#endif }; =20 int shmem_init_fs_context(struct fs_context *fc) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 786f0a245675..d035f5e17f07 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -34,6 +34,25 @@ struct mfill_state { pmd_t *pmd; }; =20 +static bool anon_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_f= lags) +{ + /* anonymous memory does not support MINOR mode */ + if (vm_flags & VM_UFFD_MINOR) + return false; + return true; +} + +static const struct vm_uffd_ops anon_uffd_ops =3D { + .can_userfault =3D anon_can_userfault, +}; + +static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return &anon_uffd_ops; + return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; +} + static __always_inline bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_en= d) { @@ -2019,13 +2038,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, uns= igned long dst_start, bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags, bool wp_async) { - vm_flags &=3D __VM_UFFD_FLAGS; + const struct vm_uffd_ops *ops =3D vma_uffd_ops(vma); =20 - if (vma->vm_flags & VM_DROPPABLE) + /* only VMAs that implement vm_uffd_ops are supported */ + if (!ops) return false; =20 - if ((vm_flags & VM_UFFD_MINOR) && - (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + vm_flags &=3D __VM_UFFD_FLAGS; + + if (vma->vm_flags & VM_DROPPABLE) return false; =20 /* @@ -2035,18 +2056,8 @@ bool vma_can_userfault(struct vm_area_struct *vma, v= m_flags_t vm_flags, if (wp_async && (vm_flags =3D=3D VM_UFFD_WP)) return true; =20 - /* - * If user requested uffd-wp but not enabled pte markers for - * uffd-wp, then shmem & hugetlbfs are not supported but only - * anonymous. 
- */ - if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && - !vma_is_anonymous(vma)) - return false; - /* By default, allow any of anon|shmem|hugetlb */ - return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || - vma_is_shmem(vma); + return ops->can_userfault(vma, vm_flags); } =20 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C1D7136CDFD; Tue, 27 Jan 2026 19:30:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542237; cv=none; b=Y4G6clJxAgZkUUx+ANmOJreklJ9s7dIegsBKmKr8yZnhLNuqmGEBh4WoQklimPAuvMPXgsrcV4imYyMWY+ixePW9O768D3y6bZcG6Wdg8xCNp5RH8Kh0WqyZNRYrrWsum+IcFGDSkrGzSDDkARWkQbZLSDCRUjl25OdZzVaT/1E= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542237; c=relaxed/simple; bh=dEjTs4g7Ldzs6VFuT/arDcgalMO2GCfU8tcBpfF3p8M=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=AEbqZ3Xyg07cLcBUh8o0L7gJ+NeHZo0KWNOWDxG6rxC2nsf4XzEYiIu4xy55yeWE1PRfjbQ4xOMgx1sS6N+nxpaxQ9Oojv/CfsUk+kcSjY3brPrF5lfQSBbrBycMpZmMA1YU/+nTLYK8EOG7j8BrwGv6LskQBvM/yyXRflgot/Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ZIFX02kq; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZIFX02kq" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E9613C19422; Tue, 27 Jan 2026 19:30:31 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542237; bh=dEjTs4g7Ldzs6VFuT/arDcgalMO2GCfU8tcBpfF3p8M=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ZIFX02kq/eb3tdlUyK3+ux/FsIlLOYCqnviY6Ie7P/JoMu0xpvH0YAzid+JPgh/wf ZeqTvHI90vv1ymfJ5mKlTSi7ENxzGNs0eiDNpe7SKCKi9YBeblG/mupcmyOBTb5lfi Du+zzNcONbAQnPXllF3puU5zkWcn3oJA5C6i24cg0xb6cyKMIBRz1sxJDbetb+GomZ ebsMHkfKrgYOtfle3aDrqX/AIg7yiUZwWzJrbevM5HsVJ7crsBY9sWOs1mGJLRC+4t QtQE0BT8W2UM4lZNTM8NJDoKhmUDX5AtUDIRlY1FGik8uzBb19PN6zjwHOtAQvIeoq e5OQTmT8pAcFQ== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. 
Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 08/17] userfaultfd, shmem: use a VMA callback to handle UFFDIO_CONTINUE Date: Tue, 27 Jan 2026 21:29:27 +0200 Message-ID: <20260127192936.1250096-9-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" When userspace resolves a page fault in a shmem VMA with UFFDIO_CONTINUE it needs to get a folio that already exists in the pagecache backing that VMA. Instead of using shmem_get_folio() for that, add a get_folio_noalloc() method to 'struct vm_uffd_ops' that will return a folio if it exists in the VMA's pagecache at given pgoff. Implement get_folio_noalloc() method for shmem and slightly refactor userfaultfd's mfill_get_vma() and mfill_atomic_pte_continue() to support this new API. Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/userfaultfd_k.h | 7 +++++++ mm/shmem.c | 15 ++++++++++++++- mm/userfaultfd.c | 32 ++++++++++++++++---------------- 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 56e85ab166c7..66dfc3c164e6 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -84,6 +84,13 @@ extern vm_fault_t handle_userfault(struct vm_fault *vmf,= unsigned long reason); struct vm_uffd_ops { /* Checks if a VMA can support userfaultfd */ bool (*can_userfault)(struct vm_area_struct *vma, vm_flags_t vm_flags); + /* + * Called to resolve UFFDIO_CONTINUE request. + * Should return the folio found at pgoff in the VMA's pagecache if it + * exists or ERR_PTR otherwise. + * The returned folio is locked and with reference held. + */ + struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); }; =20 /* A combined operation mode + behavior flags. 
*/ diff --git a/mm/shmem.c b/mm/shmem.c index 9b82cda271c4..87cd8d2fdb97 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5291,6 +5291,18 @@ static const struct super_operations shmem_ops =3D { }; =20 #ifdef CONFIG_USERFAULTFD +static struct folio *shmem_get_folio_noalloc(struct inode *inode, pgoff_t = pgoff) +{ + struct folio *folio; + int err; + + err =3D shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + if (err) + return ERR_PTR(err); + + return folio; +} + static bool shmem_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_= flags) { /* @@ -5303,7 +5315,8 @@ static bool shmem_can_userfault(struct vm_area_struct= *vma, vm_flags_t vm_flags) } =20 static const struct vm_uffd_ops shmem_uffd_ops =3D { - .can_userfault =3D shmem_can_userfault, + .can_userfault =3D shmem_can_userfault, + .get_folio_noalloc =3D shmem_get_folio_noalloc, }; #endif =20 diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index d035f5e17f07..f0e6336015f1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -188,6 +188,7 @@ static int mfill_get_vma(struct mfill_state *state) struct userfaultfd_ctx *ctx =3D state->ctx; uffd_flags_t flags =3D state->flags; struct vm_area_struct *dst_vma; + const struct vm_uffd_ops *ops; int err; =20 /* @@ -228,10 +229,12 @@ static int mfill_get_vma(struct mfill_state *state) if (is_vm_hugetlb_page(dst_vma)) goto out; =20 - if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma)) + ops =3D vma_uffd_ops(dst_vma); + if (!ops) goto out_unlock; - if (!vma_is_shmem(dst_vma) && - uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) + + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) && + !ops->get_folio_noalloc) goto out_unlock; =20 out: @@ -568,6 +571,7 @@ static int mfill_atomic_pte_zeropage(struct mfill_state= *state) static int mfill_atomic_pte_continue(struct mfill_state *state) { struct vm_area_struct *dst_vma =3D state->vma; + const struct vm_uffd_ops *ops =3D vma_uffd_ops(dst_vma); unsigned long dst_addr =3D state->dst_addr; pgoff_t pgoff =3D linear_page_index(dst_vma, dst_addr); struct inode *inode =3D file_inode(dst_vma->vm_file); @@ -577,16 +581,13 @@ static int mfill_atomic_pte_continue(struct mfill_sta= te *state) struct page *page; int ret; =20 - ret =3D shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC); + if (!ops) + return -EOPNOTSUPP; + + folio =3D ops->get_folio_noalloc(inode, pgoff); /* Our caller expects us to return -EFAULT if we failed to find folio */ - if (ret =3D=3D -ENOENT) - ret =3D -EFAULT; - if (ret) - goto out; - if (!folio) { - ret =3D -EFAULT; - goto out; - } + if (IS_ERR_OR_NULL(folio)) + return -EFAULT; =20 page =3D folio_file_page(folio, pgoff); if (PageHWPoison(page)) { @@ -600,13 +601,12 @@ static int mfill_atomic_pte_continue(struct mfill_sta= te *state) goto out_release; =20 folio_unlock(folio); - ret =3D 0; -out: - return ret; + return 0; + out_release: folio_unlock(folio); folio_put(folio); - goto out; + return ret; } =20 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. 
*/ --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 54ED8299922; Tue, 27 Jan 2026 19:30:44 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542244; cv=none; b=i0s1oXcrO3VAwxq3Jn3abwlakS/bxOlLs4P78XwrdvSKb5zUkvJGDbHWWH5s7mF06zkT9Hm7ZNUHpxVmzfUDOsdmlXgk+X5n/a+DOCDfD28ynxwx0HUv7HJ4WQHRdrBiEnyXYXHn9uLw9Mf9oTmHNUVuMHjj8nFMSQ5pcPlV6aw= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542244; c=relaxed/simple; bh=gOZGYyaN4+ceUz7X3a7wO7Gkwgg3l4aHlNGikBmh8yI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=aFBU4ZtVMbjrqO20Gsx+8ymGuzfYlk74c02RG/mLMJVdrieQXXVXbRKbVeSs7sQ1QhAIHuoeTiZgzQvpiSGd8/JfoVbf2WGuLvzZeun4mR0syKqUmX5oneDpnok1Vdh3nSvOkCqxVGJSCRbk4iajQEsq+yrX8t5p9qVdSOdi4Xk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=LXykKUde; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="LXykKUde" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 349FFC19425; Tue, 27 Jan 2026 19:30:37 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542243; bh=gOZGYyaN4+ceUz7X3a7wO7Gkwgg3l4aHlNGikBmh8yI=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=LXykKUdeTd2HBkgc2hP7eZb73bNLFL1p0lWi1DNQnLh1pAQdYs0qjNCWjTFQ2H8cL uO+5GwnBXEAb1JdsiscE81/F86rJ7Pqwk2ZTSAq5LiCkZslorgwa4m/PTS+/V1f4Uv F9WDlZuJMsfFFhXd3YkOKnwd36NqNyu1sSKWDujHgEB9BrSVcQk01TWF0KbFP2IiDZ pMau7AKFfeexKFeL1N4B0sigcjJgBDW8IsZ5Qoxmgeg34o3B3zU+ucLgnnhPLz70Po opITgJ5F1yXFHFlDeTepWkBXBq/3bO8HJ0yRKsRKlyu3ozcqqcqoCoDFe5BFq8pQRc 7I9O2ABBq9LFw== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 09/17] userfaultfd: introduce vm_uffd_ops->alloc_folio() Date: Tue, 27 Jan 2026 21:29:28 +0200 Message-ID: <20260127192936.1250096-10-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" and use it to refactor mfill_atomic_pte_zeroed_folio() and mfill_atomic_pte_copy(). 
mfill_atomic_pte_zeroed_folio() and mfill_atomic_pte_copy() perform almost identical actions: * allocate a folio * update folio contents (either copy from userspace of fill with zeros) * update page tables with the new folio Split a __mfill_atomic_pte() helper that handles both cases and uses newly introduced vm_uffd_ops->alloc_folio() to allocate the folio. Pass the ops structure from the callers to __mfill_atomic_pte() to later allow using anon_uffd_ops for MAP_PRIVATE mappings of file-backed VMAs. Note, that the new ops method is called alloc_folio() rather than folio_alloc() to avoid clash with alloc_tag macro folio_alloc(). Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/userfaultfd_k.h | 6 +++ mm/userfaultfd.c | 92 ++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 44 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 66dfc3c164e6..4d8b879eed91 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -91,6 +91,12 @@ struct vm_uffd_ops { * The returned folio is locked and with reference held. */ struct folio *(*get_folio_noalloc)(struct inode *inode, pgoff_t pgoff); + /* + * Called during resolution of UFFDIO_COPY request. + * Should return allocate a and return folio or NULL if allocation fails. + */ + struct folio *(*alloc_folio)(struct vm_area_struct *vma, + unsigned long addr); }; =20 /* A combined operation mode + behavior flags. */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index f0e6336015f1..b3c12630769c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -42,8 +42,26 @@ static bool anon_can_userfault(struct vm_area_struct *vm= a, vm_flags_t vm_flags) return true; } =20 +static struct folio *anon_alloc_folio(struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio =3D vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, + addr); + + if (!folio) + return NULL; + + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; + } + + return folio; +} + static const struct vm_uffd_ops anon_uffd_ops =3D { .can_userfault =3D anon_can_userfault, + .alloc_folio =3D anon_alloc_folio, }; =20 static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) @@ -455,7 +473,8 @@ static int mfill_copy_folio_retry(struct mfill_state *s= tate, struct folio *folio return 0; } =20 -static int mfill_atomic_pte_copy(struct mfill_state *state) +static int __mfill_atomic_pte(struct mfill_state *state, + const struct vm_uffd_ops *ops) { unsigned long dst_addr =3D state->dst_addr; unsigned long src_addr =3D state->src_addr; @@ -463,20 +482,22 @@ static int mfill_atomic_pte_copy(struct mfill_state *= state) struct folio *folio; int ret; =20 - folio =3D vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, state->vma, dst_addr); + folio =3D ops->alloc_folio(state->vma, state->dst_addr); if (!folio) return -ENOMEM; =20 - ret =3D -ENOMEM; - if (mem_cgroup_charge(folio, state->vma->vm_mm, GFP_KERNEL)) - goto out_release; - - ret =3D mfill_copy_folio_locked(folio, src_addr); - if (unlikely(ret)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { + ret =3D mfill_copy_folio_locked(folio, src_addr); /* fallback to copy_from_user outside mmap_lock */ - ret =3D mfill_copy_folio_retry(state, folio); - if (ret) - goto out_release; + if (unlikely(ret)) { + ret =3D mfill_copy_folio_retry(state, folio); + if (ret) + goto err_folio_put; + } + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + clear_user_highpage(&folio->page, state->dst_addr); + } else { + 
VM_WARN_ONCE(1, "unknown UFFDIO operation"); } =20 /* @@ -489,47 +510,30 @@ static int mfill_atomic_pte_copy(struct mfill_state *= state) ret =3D mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, &folio->page, true, flags); if (ret) - goto out_release; -out: - return ret; -out_release: + goto err_folio_put; + + return 0; + +err_folio_put: + folio_put(folio); /* Don't return -ENOENT so that our caller won't retry */ if (ret =3D=3D -ENOENT) ret =3D -EFAULT; - folio_put(folio); - goto out; + return ret; } =20 -static int mfill_atomic_pte_zeroed_folio(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) +static int mfill_atomic_pte_copy(struct mfill_state *state) { - struct folio *folio; - int ret =3D -ENOMEM; - - folio =3D vma_alloc_zeroed_movable_folio(dst_vma, dst_addr); - if (!folio) - return ret; - - if (mem_cgroup_charge(folio, dst_vma->vm_mm, GFP_KERNEL)) - goto out_put; + const struct vm_uffd_ops *ops =3D vma_uffd_ops(state->vma); =20 - /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * zeroing out the folio become visible before mapping the page - * using set_pte_at(). See do_anonymous_page(). - */ - __folio_mark_uptodate(folio); + return __mfill_atomic_pte(state, ops); +} =20 - ret =3D mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, 0); - if (ret) - goto out_put; +static int mfill_atomic_pte_zeroed_folio(struct mfill_state *state) +{ + const struct vm_uffd_ops *ops =3D vma_uffd_ops(state->vma); =20 - return 0; -out_put: - folio_put(folio); - return ret; + return __mfill_atomic_pte(state, ops); } =20 static int mfill_atomic_pte_zeropage(struct mfill_state *state) @@ -542,7 +546,7 @@ static int mfill_atomic_pte_zeropage(struct mfill_state= *state) int ret; =20 if (mm_forbids_zeropage(dst_vma->vm_mm)) - return mfill_atomic_pte_zeroed_folio(dst_pmd, dst_vma, dst_addr); + return mfill_atomic_pte_zeroed_folio(state); =20 _dst_pte =3D pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 90C09299922; Tue, 27 Jan 2026 19:30:50 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542250; cv=none; b=lMTW+k4JbKJPC59gw7YttfLXK7aZsUbAVwBBuXGDFr4eaTyT6YXhW7PO3czYL9AeRSjpBJX8zs3KEsW55yqfK6XXcRl20l54x8+w0ZPyp1sMD1iL/BJbR/MMBlLaylGGg/QUMsdMM4V6p1futffBxDbqNKvC9BAE24XP/zSskoI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542250; c=relaxed/simple; bh=Xb+xc8R1aRP1DZaVc1IOpEFUHCEmFeum4D1XaFDWybk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=UVEqNyvn0sBbkOluGEVh2/I+u20ujF18xawwRVbofpMTLQkymLY5X09GZJz39na8WKqrxSrVd+Yx7+Wl4rgsDN8JR/f3iy4UxlZ4mBfpK9k9vA9Oujc7OTYl4a8AYpc6u4xDZN3BSXnVi4Vj/p/1Mb++3nIl/D9dUSG0fjCwuMs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=UoCRp3/P; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="UoCRp3/P" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 748E7C116C6; Tue, 27 Jan 2026 19:30:44 +0000 
(UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542250; bh=Xb+xc8R1aRP1DZaVc1IOpEFUHCEmFeum4D1XaFDWybk=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=UoCRp3/PvJtRL9uUKUMuSaU/ZNr6oeqV3RCuFhVjCgZd8x//VhYNUnnt5pcYsGuKT 5qhIL46SNavcTIZ8Jv5URb9fDi6KJcGK9SM+fLlgGQCVyP4ZUEm0hHP0yEw1xcLLRi FcQrnaLlXYSfgBnto6Nb3+N/k6lj4BlgUx8X1kyO8nCKkT/KSEvHP0lJrhaxo+VOUL MM5TwAybb4vkrOM3IqIcB1vfW1m4aUzGawv7JLMAGE+mYsZRdB3e5m7zH1ea6x6jcB 116+gB1VNxLP3DssltYW0BK53oQFU3I9kY0f6dGqyZ+NUzMS5G82qArY4QCT5jaT/K G2p6bpe8uuD6Q== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 10/17] shmem, userfaultfd: implement shmem uffd operations using vm_uffd_ops Date: Tue, 27 Jan 2026 21:29:29 +0200 Message-ID: <20260127192936.1250096-11-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" Add filemap_add() and filemap_remove() methods to vm_uffd_ops and use them in __mfill_atomic_pte() to add shmem folios to page cache and remove them in case of error. Implement these methods in shmem along with vm_uffd_ops->alloc_folio() and drop shmem_mfill_atomic_pte(). Since userfaultfd now does not reference any functions from shmem, drop include if linux/shmem_fs.h from mm/userfaultfd.c mfill_atomic_install_pte() is not used anywhere outside of mm/userfaultfd, make it static. 
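For reference, with these hooks in place the UFFDIO_COPY path in
__mfill_atomic_pte() boils down to roughly the following (simplified
sketch, error handling and the zeropage case omitted):

	folio = ops->alloc_folio(state->vma, state->dst_addr);
	mfill_copy_folio_locked(folio, state->src_addr);  /* may retry outside mmap_lock */
	__folio_mark_uptodate(folio);
	if (ops->filemap_add)
		ops->filemap_add(folio, state->vma, state->dst_addr);
	mfill_atomic_install_pte(state->pmd, state->vma, state->dst_addr,
				 &folio->page, flags);

so the only filesystem-specific steps are folio allocation and page cache
insertion; anonymous memory simply leaves ->filemap_add() and
->filemap_remove() unset.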
Signed-off-by: Mike Rapoport (Microsoft) fixup Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/shmem_fs.h | 14 ---- include/linux/userfaultfd_k.h | 20 +++-- mm/shmem.c | 148 ++++++++++++---------------------- mm/userfaultfd.c | 79 +++++++++--------- 4 files changed, 106 insertions(+), 155 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e2069b3179c4..754f17e5b53c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -223,20 +223,6 @@ static inline pgoff_t shmem_fallocend(struct inode *in= ode, pgoff_t eof) =20 extern bool shmem_charge(struct inode *inode, long pages); =20 -#ifdef CONFIG_USERFAULTFD -#ifdef CONFIG_SHMEM -extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop); -#else /* !CONFIG_SHMEM */ -#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \ - src_addr, flags, foliop) ({ BUG(); 0; }) -#endif /* CONFIG_SHMEM */ -#endif /* CONFIG_USERFAULTFD */ - /* * Used space is stored as unsigned 64-bit value in bytes but * quota core supports only signed 64-bit values so use that diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 4d8b879eed91..75d5b09f2560 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -97,6 +97,21 @@ struct vm_uffd_ops { */ struct folio *(*alloc_folio)(struct vm_area_struct *vma, unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request. + * Should lock the folio and add it to VMA's page cache. + * Returns 0 on success, error code on failre. + */ + int (*filemap_add)(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); + /* + * Called during resolution of UFFDIO_COPY request on the error + * handling path. + * Should revert the operation of ->filemap_add(). + * The folio should be unlocked, but the reference to it should not be + * dropped. + */ + void (*filemap_remove)(struct folio *folio, struct vm_area_struct *vma); }; =20 /* A combined operation mode + behavior flags. */ @@ -130,11 +145,6 @@ static inline uffd_flags_t uffd_flags_set_mode(uffd_fl= ags_t flags, enum mfill_at /* Flags controlling behavior. These behavior changes are mode-independent= . 
*/ #define MFILL_ATOMIC_WP MFILL_ATOMIC_FLAG(0) =20 -extern int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags); - extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned lon= g dst_start, unsigned long src_start, unsigned long len, uffd_flags_t flags); diff --git a/mm/shmem.c b/mm/shmem.c index 87cd8d2fdb97..6f0485f76cb8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3169,118 +3169,73 @@ static inline struct inode *shmem_get_inode(struct= mnt_idmap *idmap, #endif /* CONFIG_TMPFS_QUOTA */ =20 #ifdef CONFIG_USERFAULTFD -int shmem_mfill_atomic_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - uffd_flags_t flags, - struct folio **foliop) -{ - struct inode *inode =3D file_inode(dst_vma->vm_file); - struct shmem_inode_info *info =3D SHMEM_I(inode); +static struct folio *shmem_mfill_folio_alloc(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode =3D file_inode(vma->vm_file); struct address_space *mapping =3D inode->i_mapping; + struct shmem_inode_info *info =3D SHMEM_I(inode); + pgoff_t pgoff =3D linear_page_index(vma, addr); gfp_t gfp =3D mapping_gfp_mask(mapping); - pgoff_t pgoff =3D linear_page_index(dst_vma, dst_addr); - void *page_kaddr; struct folio *folio; - int ret; - pgoff_t max_off; - - if (shmem_inode_acct_blocks(inode, 1)) { - /* - * We may have got a page, returned -ENOENT triggering a retry, - * and now we find ourselves with -ENOMEM. Release the page, to - * avoid a BUG_ON in our caller. - */ - if (unlikely(*foliop)) { - folio_put(*foliop); - *foliop =3D NULL; - } - return -ENOMEM; - } =20 - if (!*foliop) { - ret =3D -ENOMEM; - folio =3D shmem_alloc_folio(gfp, 0, info, pgoff); - if (!folio) - goto out_unacct_blocks; + if (unlikely(pgoff >=3D DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) + return NULL; =20 - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) { - page_kaddr =3D kmap_local_folio(folio, 0); - /* - * The read mmap_lock is held here. Despite the - * mmap_lock being read recursive a deadlock is still - * possible if a writer has taken a lock. For example: - * - * process A thread 1 takes read lock on own mmap_lock - * process A thread 2 calls mmap, blocks taking write lock - * process B thread 1 takes page fault, read lock on own mmap lock - * process B thread 2 calls mmap, blocks taking write lock - * process A thread 1 blocks taking read lock on process B - * process B thread 1 blocks taking read lock on process A - * - * Disable page faults to prevent potential deadlock - * and retry the copy outside the mmap_lock. 
- */ - pagefault_disable(); - ret =3D copy_from_user(page_kaddr, - (const void __user *)src_addr, - PAGE_SIZE); - pagefault_enable(); - kunmap_local(page_kaddr); - - /* fallback to copy_from_user outside mmap_lock */ - if (unlikely(ret)) { - *foliop =3D folio; - ret =3D -ENOENT; - /* don't free the page */ - goto out_unacct_blocks; - } + folio =3D shmem_alloc_folio(gfp, 0, info, pgoff); + if (!folio) + return NULL; =20 - flush_dcache_folio(folio); - } else { /* ZEROPAGE */ - clear_user_highpage(&folio->page, dst_addr); - } - } else { - folio =3D *foliop; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - *foliop =3D NULL; + if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL)) { + folio_put(folio); + return NULL; } =20 - VM_BUG_ON(folio_test_locked(folio)); - VM_BUG_ON(folio_test_swapbacked(folio)); + return folio; +} + +static int shmem_mfill_filemap_add(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode =3D file_inode(vma->vm_file); + struct address_space *mapping =3D inode->i_mapping; + pgoff_t pgoff =3D linear_page_index(vma, addr); + gfp_t gfp =3D mapping_gfp_mask(mapping); + int err; + __folio_set_locked(folio); __folio_set_swapbacked(folio); - __folio_mark_uptodate(folio); - - ret =3D -EFAULT; - max_off =3D DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(pgoff >=3D max_off)) - goto out_release; =20 - ret =3D mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); - if (ret) - goto out_release; - ret =3D shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); - if (ret) - goto out_release; + err =3D shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); + if (err) + goto err_unlock; =20 - ret =3D mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - &folio->page, true, flags); - if (ret) - goto out_delete_from_cache; + if (shmem_inode_acct_blocks(inode, 1)) { + err =3D -ENOMEM; + goto err_delete_from_cache; + } =20 + folio_add_lru(folio); shmem_recalc_inode(inode, 1, 0); - folio_unlock(folio); + return 0; -out_delete_from_cache: + +err_delete_from_cache: filemap_remove_folio(folio); -out_release: +err_unlock: + folio_unlock(folio); + return err; +} + +static void shmem_mfill_filemap_remove(struct folio *folio, + struct vm_area_struct *vma) +{ + struct inode *inode =3D file_inode(vma->vm_file); + + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); folio_unlock(folio); - folio_put(folio); -out_unacct_blocks: - shmem_inode_unacct_blocks(inode, 1); - return ret; } #endif /* CONFIG_USERFAULTFD */ =20 @@ -5317,6 +5272,9 @@ static bool shmem_can_userfault(struct vm_area_struct= *vma, vm_flags_t vm_flags) static const struct vm_uffd_ops shmem_uffd_ops =3D { .can_userfault =3D shmem_can_userfault, .get_folio_noalloc =3D shmem_get_folio_noalloc, + .alloc_folio =3D shmem_mfill_folio_alloc, + .filemap_add =3D shmem_mfill_filemap_add, + .filemap_remove =3D shmem_mfill_filemap_remove, }; #endif =20 diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index b3c12630769c..54aa195237ba 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include "internal.h" @@ -337,10 +336,10 @@ static bool mfill_file_over_size(struct vm_area_struc= t *dst_vma, * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both s= hmem * and anon, and for both shared and private VMAs. 
*/ -int mfill_atomic_install_pte(pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, struct page *page, - bool newly_allocated, uffd_flags_t flags) +static int mfill_atomic_install_pte(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, struct page *page, + uffd_flags_t flags) { int ret; struct mm_struct *dst_mm =3D dst_vma->vm_mm; @@ -384,9 +383,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, goto out_unlock; =20 if (page_in_cache) { - /* Usually, cache pages are already added to LRU */ - if (newly_allocated) - folio_add_lru(folio); folio_add_file_rmap_pte(folio, page, dst_vma); } else { folio_add_new_anon_rmap(folio, dst_vma, dst_addr, RMAP_EXCLUSIVE); @@ -401,6 +397,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, =20 set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); =20 + if (page_in_cache) + folio_unlock(folio); + /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret =3D 0; @@ -507,13 +506,22 @@ static int __mfill_atomic_pte(struct mfill_state *sta= te, */ __folio_mark_uptodate(folio); =20 + if (ops->filemap_add) { + ret =3D ops->filemap_add(folio, state->vma, state->dst_addr); + if (ret) + goto err_folio_put; + } + ret =3D mfill_atomic_install_pte(state->pmd, state->vma, dst_addr, - &folio->page, true, flags); + &folio->page, flags); if (ret) - goto err_folio_put; + goto err_filemap_remove; =20 return 0; =20 +err_filemap_remove: + if (ops->filemap_remove) + ops->filemap_remove(folio, state->vma); err_folio_put: folio_put(folio); /* Don't return -ENOENT so that our caller won't retry */ @@ -526,6 +534,18 @@ static int mfill_atomic_pte_copy(struct mfill_state *s= tate) { const struct vm_uffd_ops *ops =3D vma_uffd_ops(state->vma); =20 + /* + * The normal page fault path for a MAP_PRIVATE mapping in a + * file-backed VMA will invoke the fault, fill the hole in the file and + * COW it right away. The result generates plain anonymous memory. + * So when we are asked to fill a hole in a MAP_PRIVATE mapping, we'll + * generate anonymous memory directly without actually filling the + * hole. For the MAP_PRIVATE case the robustness check only happens in + * the pagetable (to verify it's still none) and not in the page cache. 
+ */ + if (!(state->vma->vm_flags & VM_SHARED)) + ops =3D &anon_uffd_ops; + return __mfill_atomic_pte(state, ops); } =20 @@ -545,7 +565,8 @@ static int mfill_atomic_pte_zeropage(struct mfill_state= *state) spinlock_t *ptl; int ret; =20 - if (mm_forbids_zeropage(dst_vma->vm_mm)) + if (mm_forbids_zeropage(dst_vma->vm_mm) || + (dst_vma->vm_flags & VM_SHARED)) return mfill_atomic_pte_zeroed_folio(state); =20 _dst_pte =3D pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), @@ -600,11 +621,10 @@ static int mfill_atomic_pte_continue(struct mfill_sta= te *state) } =20 ret =3D mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr, - page, false, flags); + page, flags); if (ret) goto out_release; =20 - folio_unlock(folio); return 0; =20 out_release: @@ -827,41 +847,18 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultf= d_ctx *ctx, =20 static __always_inline ssize_t mfill_atomic_pte(struct mfill_state *state) { - struct vm_area_struct *dst_vma =3D state->vma; - unsigned long src_addr =3D state->src_addr; - unsigned long dst_addr =3D state->dst_addr; - struct folio **foliop =3D &state->folio; uffd_flags_t flags =3D state->flags; - pmd_t *dst_pmd =3D state->pmd; - ssize_t err; =20 if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) return mfill_atomic_pte_continue(state); if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) return mfill_atomic_pte_poison(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) + return mfill_atomic_pte_copy(state); + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) + return mfill_atomic_pte_zeropage(state); =20 - /* - * The normal page fault path for a shmem will invoke the - * fault, fill the hole in the file and COW it right away. The - * result generates plain anonymous memory. So when we are - * asked to fill an hole in a MAP_PRIVATE shmem mapping, we'll - * generate anonymous memory directly without actually filling - * the hole. For the MAP_PRIVATE case the robustness check - * only happens in the pagetable (to verify it's still none) - * and not in the radix tree. 
- */ - if (!(dst_vma->vm_flags & VM_SHARED)) { - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) - err =3D mfill_atomic_pte_copy(state); - else - err =3D mfill_atomic_pte_zeropage(state); - } else { - err =3D shmem_mfill_atomic_pte(dst_pmd, dst_vma, - dst_addr, src_addr, - flags, foliop); - } - - return err; + return -EOPNOTSUPP; } =20 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx, --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D0B5232695F; Tue, 27 Jan 2026 19:30:56 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542256; cv=none; b=gcTlqUSIv6uYqdeYm6Tdg5fh2Ot2duKp++gyrFe/VnsNBBL2q/QH5caPFUQns+nyV5ghuqNXah+DZUkeAuG3vzOH/7Ye+eAe4/iipvwBnYOqxElPKvyUFIhh7DIRYuj7s4BNURaYRnZ2R7wPr+NfLb3PB3vA8dMmjPDX5Gi8VOU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542256; c=relaxed/simple; bh=dAICMUfnaufCCr/AJKZx+mvFFt/TvpVz3/Hz00SDDT0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=jhQ8iQcjnJLOpK5sENQAPiVamVwusXDkljlZ+dcRpiIXaf6rj48nCDpHGsWaNUxOi0QrZPOG7ZaMjvYms/ygcW0DOVMdT6tZjJd2CaioJt7L62fPUv20TIl8+9a+JePfBPsntUzILpT48e9QQoYocx2hRiDI0VUfwygFZbJOx5E= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=gwfokdNa; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="gwfokdNa" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B3A4CC116C6; Tue, 27 Jan 2026 19:30:50 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542256; bh=dAICMUfnaufCCr/AJKZx+mvFFt/TvpVz3/Hz00SDDT0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=gwfokdNanFjWz8h5LlR+icqZBH0klOoYnKAr1LNi95YICtl63fsrIcY8NnT+EiRGe yWtnqTwr+5yZpeYTvJL+8lRdpKwuDymh+P32EIO7DBV2T1c59DdkBL1R/WUsNWgMjo lwiKT5FEt89BPbSUwIaq5DpMdSJP8rTXn1u4K7DWOR24Jk6vMPpdim08J8mSngmaUC SPR+4MGDgZLIL7Ui9/esE9aoXfCdTxImkuayPuQIrY/yHzrePZ7E3MShe2i3YI3kJQ X9XKKQQ96ySvXVHKpmqecHcxy28t+t+LDSTtQZT/ngMyhyrkh3tbjgj2uPuiHiqjkZ T57CGZrW7YBBA== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. 
Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 11/17] userfaultfd: mfill_atomic() remove retry logic Date: Tue, 27 Jan 2026 21:29:30 +0200 Message-ID: <20260127192936.1250096-12-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" Since __mfill_atomic_pte() handles the retry for both anonymous and shmem, there is no need to retry copying the date from the userspace in the loop in mfill_atomic(). Drop the retry logic from mfill_atomic(). Signed-off-by: Mike Rapoport (Microsoft) --- mm/userfaultfd.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 54aa195237ba..1bd7631463c6 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -29,7 +29,6 @@ struct mfill_state { struct vm_area_struct *vma; unsigned long src_addr; unsigned long dst_addr; - struct folio *folio; pmd_t *pmd; }; =20 @@ -889,7 +888,6 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, VM_WARN_ON_ONCE(src_start + len <=3D src_start); VM_WARN_ON_ONCE(dst_start + len <=3D dst_start); =20 -retry: err =3D mfill_get_vma(&state); if (err) goto out; @@ -916,26 +914,6 @@ static __always_inline ssize_t mfill_atomic(struct use= rfaultfd_ctx *ctx, err =3D mfill_atomic_pte(&state); cond_resched(); =20 - if (unlikely(err =3D=3D -ENOENT)) { - void *kaddr; - - mfill_put_vma(&state); - VM_WARN_ON_ONCE(!state.folio); - - kaddr =3D kmap_local_folio(state.folio, 0); - err =3D copy_from_user(kaddr, - (const void __user *)state.src_addr, - PAGE_SIZE); - kunmap_local(kaddr); - if (unlikely(err)) { - err =3D -EFAULT; - goto out; - } - flush_dcache_folio(state.folio); - goto retry; - } else - VM_WARN_ON_ONCE(state.folio); - if (!err) { state.dst_addr +=3D PAGE_SIZE; state.src_addr +=3D PAGE_SIZE; @@ -951,8 +929,6 @@ static __always_inline ssize_t mfill_atomic(struct user= faultfd_ctx *ctx, if (state.vma) mfill_put_vma(&state); out: - if (state.folio) - folio_put(state.folio); VM_WARN_ON_ONCE(copied < 0); VM_WARN_ON_ONCE(err > 0); VM_WARN_ON_ONCE(!copied && !err); --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0D2DB2BCF7F; Tue, 27 Jan 2026 19:31:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542263; cv=none; b=X04DsPDIwsdAoA7HU/u/h0O37C8c/x/vH48iGJe7ex+Y+uJQcS9PhAHKUjgi3TS7ZsCGRWgSmFHsZqAJYzemE5+edu200QrJ4vBTaxFarpZJwsotE8p8uqiXJCLK6IJnGcg6gQs3OJaHYX87uqP03F2tCRaejwEdR0e2deVXr3E= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542263; c=relaxed/simple; bh=h+TcgWCVPD30aK45h+wpUIJypZftXhAg5ucXDIazF9Y=; 
h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=D1B83Sg6OZEo5j58Fw/L9NN270Vzf/hmSuUmjTERsSUmidldnyQlWRlw1FotSUAOjy4TQN9jD8/DkZEldk0RTa6bvlr1+YstJzVf52n7uq1AZyY3gHE6jxDljHxZNE7esaveZERCOhPLXkO9La/skurdh7Y+AG3114L/1qcHXpw= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=q2UsiRPu; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="q2UsiRPu" Received: by smtp.kernel.org (Postfix) with ESMTPSA id F19B2C116C6; Tue, 27 Jan 2026 19:30:56 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542262; bh=h+TcgWCVPD30aK45h+wpUIJypZftXhAg5ucXDIazF9Y=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=q2UsiRPuaj/8L0lU3PZimX+KdU+NnUJ4T6kgHnK7oKJ7dnF20S1Wu60dtz3r+KU1H 0T+7qAEjj3JbVgEGy3R0Hx3DRfjX2B6I9PzRs1e6vap8/WGmkswqz84raN0EuooQAV 5E4A1pVzNprdPWdgwAmWXAr69bCVgPHlBxziKewff7xctTxCeA2TMO1wQzD8XqEI6f Ed4jSaIIhYNV6/OJpzSKzJ1lZMwUBOT5T1blV/qYiHyC03fgIpcZ1vmjVxxtXEnaon 0hSCYpfGeJVTvH9YpHOBZSgxib2aLHu9IQ7ZrqCo73Rgfq3eGnPiCbchrIFCt6eypg 1wteWBH/FqJkQ== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org, "David Hildenbrand (Red Hat)" Subject: [PATCH RFC 12/17] mm: introduce VM_FAULT_UFFD_MINOR fault reason Date: Tue, 27 Jan 2026 21:29:31 +0200 Message-ID: <20260127192936.1250096-13-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" When a VMA is registered with userfaulfd in minor mode, its ->fault() method should check if a folio exists in the page cache and if yes ->fault() should call handle_userfault(VM_UFFD_MINOR). Instead of calling handle_userfault() directly from a specific ->fault() implementation introduce new fault reason VM_FAULT_UFFD_MINOR that will notify the core page fault handler that it should call handle_userfaultfd(VM_UFFD_MINOR) to complete a page fault. Replace a call to handle_userfault(VM_UFFD_MINOR) in shmem and use the new VM_FAULT_UFFD_MINOR there instead. For configurations that don't enable CONFIG_USERFAULTFD, VM_FAULT_UFFD_MINOR is set to 0. 
Suggested-by: David Hildenbrand (Red Hat) Signed-off-by: Nikita Kalyazin Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/mm_types.h | 10 +++++++++- mm/memory.c | 5 ++++- mm/shmem.c | 2 +- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 42af2292951d..b25ac322bfbf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1555,6 +1555,8 @@ typedef __bitwise unsigned int vm_fault_t; * fsync() to complete (for synchronous page faults * in DAX) * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released + * @VM_FAULT_UFFD_MINOR: ->fault did not modify page tables and needs + * handle_userfault(VM_UFFD_MINOR) to complete * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ @@ -1572,6 +1574,11 @@ enum vm_fault_reason { VM_FAULT_DONE_COW =3D (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC =3D (__force vm_fault_t)0x002000, VM_FAULT_COMPLETED =3D (__force vm_fault_t)0x004000, +#ifdef CONFIG_USERFAULTFD + VM_FAULT_UFFD_MINOR =3D (__force vm_fault_t)0x008000, +#else + VM_FAULT_UFFD_MINOR =3D (__force vm_fault_t)0x000000, +#endif VM_FAULT_HINDEX_MASK =3D (__force vm_fault_t)0x0f0000, }; =20 @@ -1596,7 +1603,8 @@ enum vm_fault_reason { { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ - { VM_FAULT_COMPLETED, "COMPLETED" } + { VM_FAULT_COMPLETED, "COMPLETED" }, \ + { VM_FAULT_UFFD_MINOR, "UFFD_MINOR" } =20 struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". */ diff --git a/mm/memory.c b/mm/memory.c index 2a55edc48a65..fcb3e0c3113e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5319,8 +5319,11 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) =20 ret =3D vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | - VM_FAULT_DONE_COW))) + VM_FAULT_DONE_COW | VM_FAULT_UFFD_MINOR))) { + if (ret & VM_FAULT_UFFD_MINOR) + return handle_userfault(vmf, VM_UFFD_MINOR); return ret; + } =20 folio =3D page_folio(vmf->page); if (unlikely(PageHWPoison(vmf->page))) { diff --git a/mm/shmem.c b/mm/shmem.c index 6f0485f76cb8..6aa905147c0c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2481,7 +2481,7 @@ static int shmem_get_folio_gfp(struct inode *inode, p= goff_t index, if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); - *fault_type =3D handle_userfault(vmf, VM_UFFD_MINOR); + *fault_type =3D VM_FAULT_UFFD_MINOR; return 0; } =20 --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 81D2D36E49A; Tue, 27 Jan 2026 19:31:09 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542269; cv=none; b=LxTXy+0XqLmMMvTNzOMjgqECiWxHdZWQ+uiU4VRL/yOJrjR1lqSTp5SqULl/JNThPp9O5Ef+eM///Bhp2fZi8SmyLRAZOEZ2nkGtDcVvMiI29vUi2rX414nf5vWO+nVHZjt/iTK9TlU8n/ZKm4vWBPxU3zskRjse8+AMVo2YO/M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542269; c=relaxed/simple; bh=+RlyxhG8wPwYaWoZfV3wHqPdgp7OwXA/s55MoAiNhJU=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=syT8WJwSamsFt+MgIVNNL2Ogu/2Z1/ZjMTV8qz0tx44qkqehcvzku7NuG2d54pQ37oaLVlESc94T0Ia3PVyVCxlJytNSWVajdpBQ2iFnK+omsv2GCwhSetvrORqx5RQg5Ek6q+X0zo5gA/18aYSCyxMuuHoFiwGYtgyCxS1gHok= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=hxCun7NY; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="hxCun7NY" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 72991C116C6; Tue, 27 Jan 2026 19:31:03 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542269; bh=+RlyxhG8wPwYaWoZfV3wHqPdgp7OwXA/s55MoAiNhJU=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=hxCun7NYeayN6IkquZNPDK6UUnd1cAbljA2eL0BVX9w58wypp6/mCfqTEe8bHxP5a HG4OouTbgYoMhHWfngWLGr5c2/z/XbLR5ETtxzfmFLGLns51cPfDLBrvvSUx18cp4e ndfN0qt9j/sfkyHEqyjwx8YVt5nPQE3T389NiQ0VWC92VxIiaSBx8/e+63FoGVxIWZ 4Zcs+7dyUpegnBUsSn5MbvHtNdkIrtfxEYCtVqLblH1y7e4cMmySJc/k11QIs8g/Uh WmM/cmFVYJd2vqUFcJ9yZ5uiwJ7k1/o/TOfpz73DmNLM4l61/9gwO/XPQZdiZ3f8kT v+fkzz1g90MJQ== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 13/17] mm: introduce VM_FAULT_UFFD_MISSING fault reason Date: Tue, 27 Jan 2026 21:29:32 +0200 Message-ID: <20260127192936.1250096-14-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Nikita Kalyazin When a VMA is registered with userfaulfd in missing mode, its ->fault() method should check if a folio exists in the page cache and if no ->fault() should call handle_userfault(VM_UFFD_MISSING). Instead of calling handle_userfault() directly from a specific ->fault() implementation introduce new fault reason VM_FAULT_UFFD_MISSING that will notify the core page fault handler that it should call handle_userfaultfd(VM_UFFD_MISSING) to complete a page fault. Replace a call to handle_userfault(VM_UFFD_MISSING) in shmem and use the new VM_FAULT_UFFD_MISSING there instead. For configurations that don't enable CONFIG_USERFAULTFD, VM_FAULT_UFFD_MISSING is set to 0. 
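The expected usage mirrors VM_FAULT_UFFD_MINOR from the previous patch,
only with the condition inverted: a ->fault() handler of a VMA registered
in missing mode that finds no folio in the page cache returns the new
reason instead of calling handle_userfault() directly, e.g. (sketch):

	if (userfaultfd_missing(vmf->vma) && !folio)
		return VM_FAULT_UFFD_MISSING;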
Signed-off-by: Nikita Kalyazin Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/mm_types.h | 7 ++++++- mm/memory.c | 5 ++++- mm/shmem.c | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b25ac322bfbf..a061c43e835b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1557,6 +1557,8 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_UFFD_MINOR: ->fault did not modify page tables and needs * handle_userfault(VM_UFFD_MINOR) to complete + * @VM_FAULT_UFFD_MISSING: ->fault did not modify page tables and needs + * handle_userfault(VM_UFFD_MISSING) to complete * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ @@ -1576,8 +1578,10 @@ enum vm_fault_reason { VM_FAULT_COMPLETED =3D (__force vm_fault_t)0x004000, #ifdef CONFIG_USERFAULTFD VM_FAULT_UFFD_MINOR =3D (__force vm_fault_t)0x008000, + VM_FAULT_UFFD_MISSING =3D (__force vm_fault_t)0x010000, #else VM_FAULT_UFFD_MINOR =3D (__force vm_fault_t)0x000000, + VM_FAULT_UFFD_MISSING =3D (__force vm_fault_t)0x000000, #endif VM_FAULT_HINDEX_MASK =3D (__force vm_fault_t)0x0f0000, }; @@ -1604,7 +1608,8 @@ enum vm_fault_reason { { VM_FAULT_DONE_COW, "DONE_COW" }, \ { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ { VM_FAULT_COMPLETED, "COMPLETED" }, \ - { VM_FAULT_UFFD_MINOR, "UFFD_MINOR" } + { VM_FAULT_UFFD_MINOR, "UFFD_MINOR" }, \ + { VM_FAULT_UFFD_MISSING, "UFFD_MISSING" } =20 struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". */ diff --git a/mm/memory.c b/mm/memory.c index fcb3e0c3113e..f72e69a43b68 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5319,9 +5319,12 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) =20 ret =3D vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | - VM_FAULT_DONE_COW | VM_FAULT_UFFD_MINOR))) { + VM_FAULT_DONE_COW | VM_FAULT_UFFD_MINOR | + VM_FAULT_UFFD_MISSING))) { if (ret & VM_FAULT_UFFD_MINOR) return handle_userfault(vmf, VM_UFFD_MINOR); + if (ret & VM_FAULT_UFFD_MISSING) + return handle_userfault(vmf, VM_UFFD_MISSING); return ret; } =20 diff --git a/mm/shmem.c b/mm/shmem.c index 6aa905147c0c..1bc544cab2a8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2530,7 +2530,7 @@ static int shmem_get_folio_gfp(struct inode *inode, p= goff_t index, */ =20 if (vma && userfaultfd_missing(vma)) { - *fault_type =3D handle_userfault(vmf, VM_UFFD_MISSING); + *fault_type =3D VM_FAULT_UFFD_MISSING; return 0; } =20 --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id DADED36EA80; Tue, 27 Jan 2026 19:31:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542275; cv=none; b=eB6D3a3DSh0S0RdKVLqF8QEpOnwsRRNg6xY5PYg0N5Xx+MKDIjvP15rckwhdwoDm/ccXsiy2LnTDr/o1wr9dX8Z7REjlPavh6+YvwKY9U2V0H8yiZGeYqsTz3i8nwWHDil8zl1cmZzVg0BLwdBidPLbdQsYxxUSZE1Tt0o28x7s= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542275; c=relaxed/simple; bh=mw7CGKFvx9FMe3BW36n9yIoK6eLgZ7r0YBQzEf/E1b0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=LLnnt3WpzGi1Lx/6SSKFAjtXSQyTcF+xWAYjxUM0XBbF2NlujD6ESTySlNz/uZ5xouG9cJuw8ZYZJ8PyGJJIzp89NKQsSU0lq30lVjSzzahW01bjTFXj+T8SyiR2SHMc34rrSPZL4Z44rDm9Q3sinfdH6wGfKZoN0jT9bQpUDoQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=aL8Ud7ZH; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="aL8Ud7ZH" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B13DFC19425; Tue, 27 Jan 2026 19:31:09 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542275; bh=mw7CGKFvx9FMe3BW36n9yIoK6eLgZ7r0YBQzEf/E1b0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=aL8Ud7ZHfQ4ni3W9TzUjcO4neGENPtoWsNsKBI7VRTQuCI23PUjuSBLAZ1aN1GyaB vFN7k/Vu9D2wGsSu8iQWIGu2ynUywvwZFTHrPKEAjb3d/c0voxyjrRPlwDWRKHN2yJ q/nh3xFx76c2uTqJYr3FUFEnTkXgqPngXmCl2odtLwn0tntuQy9klzz3kcXtDXEnFt 752XLRVFkzKgxxE2Gjwi+WxKUqKgsUihtNV9KgWPbUFok1BDtVxsYfgH+MAQgTu2BA Ye9tzP0J8YOhcuP03PhQP4JI42fQP5Zyv+8lZy9B9agBHAOs0e++FT3hToTRTMprNI WwiZAnbQka6Lg== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 14/17] KVM: guest_memfd: implement userfaultfd minor mode Date: Tue, 27 Jan 2026 21:29:33 +0200 Message-ID: <20260127192936.1250096-15-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Nikita Kalyazin userfaultfd notifications about minor page faults used for live migration and snapshotting of VMs with memory backed by shared hugetlbfs or tmpfs mappings as described in detail in commit 7677f7fd8be7 ("userfaultfd: add minor fault registration mode"). To use the same mechanism for VMs that use guest_memfd to map their memory, guest_memfd should support userfaultfd minor mode. Extend ->fault() method of guest_memfd with ability to notify core page fault handler that a page fault requires handle_userfault(VM_UFFD_MINOR) to complete and add vm_uffd_ops to guest_memfd vm_ops with implementation of ->can_userfault() and ->get_folio_noalloc() methods. 
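From the userspace point of view nothing changes compared to existing
minor mode users: a VMM that maps a guest_memfd created with
GUEST_MEMFD_FLAG_INIT_SHARED registers the mapping the usual way. A rough
illustration (fragment; needs <linux/userfaultfd.h>, <sys/ioctl.h>,
<sys/syscall.h>, <fcntl.h> and <unistd.h>; feature negotiation details
and error handling omitted):

	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | UFFD_USER_MODE_ONLY);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)gmem_va, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MINOR,
	};

	ioctl(uffd, UFFDIO_API, &api);
	ioctl(uffd, UFFDIO_REGISTER, &reg);

Here gmem_va stands for a host mapping of the guest_memfd file and is not
a name used by this series.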
Signed-off-by: Nikita Kalyazin Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) --- virt/kvm/guest_memfd.c | 76 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 11 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fdaea3422c30..087e7632bf70 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -7,6 +7,7 @@ #include #include #include +#include =20 #include "kvm_mm.h" =20 @@ -121,6 +122,26 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, str= uct kvm_memory_slot *slot, return r; } =20 +static struct folio *kvm_gmem_get_folio_noalloc(struct inode *inode, pgoff= _t pgoff) +{ + return __filemap_get_folio(inode->i_mapping, pgoff, + FGP_LOCK | FGP_ACCESSED, 0); +} + +static struct folio *__kvm_gmem_folio_alloc(struct inode *inode, pgoff_t i= ndex) +{ + struct mempolicy *policy; + struct folio *folio; + + policy =3D mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); + folio =3D __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; +} + /* * Returns a locked folio on success. The caller is responsible for * setting the up-to-date flag before the memory is mapped into the guest. @@ -133,25 +154,17 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, st= ruct kvm_memory_slot *slot, static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ - struct mempolicy *policy; struct folio *folio; =20 /* * Fast-path: See if folio is already present in mapping to avoid * policy_lookup. */ - folio =3D __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED, 0); + folio =3D kvm_gmem_get_folio_noalloc(inode, index); if (!IS_ERR(folio)) return folio; =20 - policy =3D mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); - folio =3D __filemap_get_folio_mpol(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, - mapping_gfp_mask(inode->i_mapping), policy); - mpol_cond_put(policy); - - return folio; + return __kvm_gmem_folio_alloc(inode, index); } =20 static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct ino= de *inode) @@ -405,7 +418,24 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct v= m_fault *vmf) if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; =20 - folio =3D kvm_gmem_get_folio(inode, vmf->pgoff); + folio =3D __filemap_get_folio(inode->i_mapping, vmf->pgoff, + FGP_LOCK | FGP_ACCESSED, 0); + + if (userfaultfd_armed(vmf->vma)) { + /* + * If userfaultfd is registered in minor mode and a folio + * exists, return VM_FAULT_UFFD_MINOR to trigger the + * userfaultfd handler. + */ + if (userfaultfd_minor(vmf->vma) && !IS_ERR_OR_NULL(folio)) { + ret =3D VM_FAULT_UFFD_MINOR; + goto out_folio; + } + } + + /* folio not in the pagecache, try to allocate */ + if (IS_ERR(folio)) + folio =3D __kvm_gmem_folio_alloc(inode, vmf->pgoff); if (IS_ERR(folio)) { if (PTR_ERR(folio) =3D=3D -EAGAIN) return VM_FAULT_RETRY; @@ -462,12 +492,36 @@ static struct mempolicy *kvm_gmem_get_policy(struct v= m_area_struct *vma, } #endif /* CONFIG_NUMA */ =20 +#ifdef CONFIG_USERFAULTFD +static bool kvm_gmem_can_userfault(struct vm_area_struct *vma, vm_flags_t = vm_flags) +{ + struct inode *inode =3D file_inode(vma->vm_file); + + /* + * Only support userfaultfd for guest_memfd with INIT_SHARED flag. + * This ensures the memory can be mapped to userspace. 
+ */ + if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) + return false; + + return true; +} + +static const struct vm_uffd_ops kvm_gmem_uffd_ops =3D { + .can_userfault =3D kvm_gmem_can_userfault, + .get_folio_noalloc =3D kvm_gmem_get_folio_noalloc, +}; +#endif /* CONFIG_USERFAULTFD */ + static const struct vm_operations_struct kvm_gmem_vm_ops =3D { .fault =3D kvm_gmem_fault_user_mapping, #ifdef CONFIG_NUMA .get_policy =3D kvm_gmem_get_policy, .set_policy =3D kvm_gmem_set_policy, #endif +#ifdef CONFIG_USERFAULTFD + .uffd_ops =3D &kvm_gmem_uffd_ops, +#endif }; =20 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 204FD299922; Tue, 27 Jan 2026 19:31:21 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542282; cv=none; b=U0TGe2UXb+XkcVL3GB3wN0Fh+cGKfXcsGYsmUUxl65ksqVMR6zaxnXGcSwZ5fp+NXTCT4ZUtyQmD5FpRfSjNvklAWaPGoU3ctTjgVq8+f9w0DT1XvMFNhU2yuIGYno4rOLfUirPZg5ryZg9aPWDIpiYfopBVWF74W1Zlt3/pGxo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542282; c=relaxed/simple; bh=96EEwJcWySxO2R/2KP/gCzA6SPsvcN9sMFGfwSmvv8I=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=BKjA+sougm+gJbsZCK6etP+iGD3Ngewf0FtwrVxw1r0anOrcjVuLi07GHZ7rSf+APNviGmtSb+jLa4H1DtQSroUexk/cX1xNAqcukuPc01rlVAHH0BxGhI3umD9VuwtQnlOINASiVh7jOXEgtYHhW8nyiAM2q7bhIS+XX5+MGH4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ZW1wjmlE; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ZW1wjmlE" Received: by smtp.kernel.org (Postfix) with ESMTPSA id EF23CC116C6; Tue, 27 Jan 2026 19:31:15 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542281; bh=96EEwJcWySxO2R/2KP/gCzA6SPsvcN9sMFGfwSmvv8I=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ZW1wjmlEaWOSdy5FnqJJ6HOmgWV7pu0s4MRk84xlIeo88QjzOqjdIYlrG7ljWeU0v MeVT/1r5gvKQ8hwIT2RBtV0wKCMd8ov24qkrjcRy4Vfh/Y21OxTvvwPWedf28sTtpo suFN5ZKjXWM1NKIUy+6Dpr+0RTJ2rjrKbQS/lNK/SjFllGnjxzucAuckujiOyB+4U9 ccJBVk/3++oHxXEHSESsxUT5xJTAVOyEPse6JLGhtLLEhZZ/XVVi8U9wBHMb6IXpTH /5+0b26eHzpI8qMV5AnhQOxvJp1yjrHIWrURA/5Ndiprz3rlpr6hJnVNc2kD2LXCFh tOhezlpGaLTpQ== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. 
Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 15/17] KVM: guest_memfd: implement userfaultfd missing mode Date: Tue, 27 Jan 2026 21:29:34 +0200 Message-ID: <20260127192936.1250096-16-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Nikita Kalyazin userfaultfd missing mode allows populating guest memory with the content supplied by userspace on demand. Extend guest_memfd implementation of vm_uffd_ops to support MISSING mode. Signed-off-by: Nikita Kalyazin Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) --- virt/kvm/guest_memfd.c | 60 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 087e7632bf70..14cca057fc0e 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -431,6 +431,14 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct v= m_fault *vmf) ret =3D VM_FAULT_UFFD_MINOR; goto out_folio; } + + /* + * Check if userfaultfd is registered in missing mode. If so, + * check if a folio exists in the page cache. If not, return + * VM_FAULT_UFFD_MISSING to trigger the userfaultfd handler. + */ + if (userfaultfd_missing(vmf->vma) && IS_ERR_OR_NULL(folio)) + return VM_FAULT_UFFD_MISSING; } =20 /* folio not in the pagecache, try to allocate */ @@ -507,9 +515,59 @@ static bool kvm_gmem_can_userfault(struct vm_area_stru= ct *vma, vm_flags_t vm_fla return true; } =20 +static struct folio *kvm_gmem_folio_alloc(struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode =3D file_inode(vma->vm_file); + pgoff_t pgoff =3D linear_page_index(vma, addr); + struct mempolicy *mpol; + struct folio *folio; + gfp_t gfp; + + if (unlikely(pgoff >=3D (i_size_read(inode) >> PAGE_SHIFT))) + return NULL; + + gfp =3D mapping_gfp_mask(inode->i_mapping); + mpol =3D mpol_shared_policy_lookup(&GMEM_I(inode)->policy, pgoff); + mpol =3D mpol ?: get_task_policy(current); + folio =3D folio_alloc_mpol(gfp, 0, mpol, pgoff, numa_node_id()); + mpol_cond_put(mpol); + + return folio; +} + +static int kvm_gmem_filemap_add(struct folio *folio, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct inode *inode =3D file_inode(vma->vm_file); + struct address_space *mapping =3D inode->i_mapping; + pgoff_t pgoff =3D linear_page_index(vma, addr); + int err; + + __folio_set_locked(folio); + err =3D filemap_add_folio(mapping, folio, pgoff, GFP_KERNEL); + if (err) { + folio_unlock(folio); + return err; + } + + return 0; +} + +static void kvm_gmem_filemap_remove(struct folio *folio, + struct vm_area_struct *vma) +{ + filemap_remove_folio(folio); + folio_unlock(folio); +} + static const struct vm_uffd_ops kvm_gmem_uffd_ops =3D { - .can_userfault =3D kvm_gmem_can_userfault, + .can_userfault =3D kvm_gmem_can_userfault, .get_folio_noalloc =3D kvm_gmem_get_folio_noalloc, + .alloc_folio =3D kvm_gmem_folio_alloc, + .filemap_add =3D kvm_gmem_filemap_add, + .filemap_remove =3D 
kvm_gmem_filemap_remove, }; #endif /* CONFIG_USERFAULTFD */ =20 --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0FAF5299922; Tue, 27 Jan 2026 19:31:28 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542288; cv=none; b=skzLxXD32pYe8ViaBsVHKtMDsg/QrpsTUGnQA7oSnW5TUO43bxoAfXOuin/YcwrJHLjodgiDgBw0f3nWT9HIWpH2P1vopByvUyZPaRPQOsGceIJ2zhuGgaPIlI21pinqbWqd53vtJeywWZFhrqPVBUO1GoGaNbJVkfhUuPet4c4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542288; c=relaxed/simple; bh=rG3XSy87LBPK/+pmM3kvRr3VcKC4P6b5lnQZUhRBOEs=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=LCN8YF3NAeVxyb6+zpPSxeTeAjIDEWcdnPUmkHngtZrrm9vw3K2rVh+sZGk4zEZMPOv7CSLCCpX7pz1luaGZTaGQbXEXapHW9ZLoe+ClDU/uo5xW1aGdwhZNKwqbkY4uh8fC+7B8rbgclY7BAnaLC8OqFPAwjFzi/Vwmo88r6x4= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=RB3nfRHp; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="RB3nfRHp" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 39806C19422; Tue, 27 Jan 2026 19:31:22 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542287; bh=rG3XSy87LBPK/+pmM3kvRr3VcKC4P6b5lnQZUhRBOEs=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=RB3nfRHpGiNvOqxgpNiZzo3iMscL/p4GfrLZS4edKXp9deTKvngWQ5yIEGQy0e/5F O11XOAS9zwA6WwsoT5yGilgWcmyGjht22jVBfE4D1zPSTO4Lh29c0kFGZKshLmP6KU PjQLJdYQfcKIxrcRWATTIx/t8rhoV5ZQlgVt047UW/54jrp0GBag8tfnI1+HZy+7NB nkhOJ+iWIM8WROA3H4dahP0y9fxV4wT8efsNK6Viw+ggJHFqJrxDQq2fYuWUuN2X+S sHd7xyeTtYygAw8GtnHJxxga0bCx/Vc+eS/AblKFNnTFaa/PSOraMhjSU3JAzWMVbi VPiCBLnWztFEg== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 16/17] KVM: selftests: test userfaultfd minor for guest_memfd Date: Tue, 27 Jan 2026 21:29:35 +0200 Message-ID: <20260127192936.1250096-17-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Nikita Kalyazin The test demonstrates that a minor userfaultfd event in guest_memfd can be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl. 
Signed-off-by: Nikita Kalyazin Signed-off-by: Mike Rapoport (Microsoft) --- .../testing/selftests/kvm/guest_memfd_test.c | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing= /selftests/kvm/guest_memfd_test.c index 618c937f3c90..7612819e340a 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -10,13 +10,17 @@ #include #include #include +#include =20 #include #include #include +#include #include #include #include +#include +#include =20 #include "kvm_util.h" #include "numaif.h" @@ -329,6 +333,112 @@ static void test_create_guest_memfd_multiple(struct k= vm_vm *vm) close(fd1); } =20 +struct fault_args { + char *addr; + char value; +}; + +static void *fault_thread_fn(void *arg) +{ + struct fault_args *args =3D arg; + + /* Trigger page fault */ + args->value =3D *args->addr; + return NULL; +} + +static void test_uffd_minor(int fd, size_t total_size) +{ + struct uffdio_register uffd_reg; + struct uffdio_continue uffd_cont; + struct uffd_msg msg; + struct fault_args args; + pthread_t fault_thread; + void *mem, *mem_nofault, *buf =3D NULL; + int uffd, ret; + off_t offset =3D page_size; + void *fault_addr; + const char test_val =3D 0xcd; + + ret =3D posix_memalign(&buf, page_size, total_size); + TEST_ASSERT_EQ(ret, 0); + memset(buf, test_val, total_size); + + uffd =3D syscall(__NR_userfaultfd, O_CLOEXEC); + TEST_ASSERT(uffd !=3D -1, "userfaultfd creation should succeed"); + + struct uffdio_api uffdio_api =3D { + .api =3D UFFD_API, + .features =3D 0, + }; + ret =3D ioctl(uffd, UFFDIO_API, &uffdio_api); + TEST_ASSERT(ret !=3D -1, "ioctl(UFFDIO_API) should succeed"); + + /* Map the guest_memfd twice: once with UFFD registered, once without */ + mem =3D mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem !=3D MAP_FAILED, "mmap should succeed"); + + mem_nofault =3D mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED= , fd, 0); + TEST_ASSERT(mem_nofault !=3D MAP_FAILED, "mmap should succeed"); + + /* Register UFFD_MINOR on the first mapping */ + uffd_reg.range.start =3D (unsigned long)mem; + uffd_reg.range.len =3D total_size; + uffd_reg.mode =3D UFFDIO_REGISTER_MODE_MINOR; + ret =3D ioctl(uffd, UFFDIO_REGISTER, &uffd_reg); + TEST_ASSERT(ret !=3D -1, "ioctl(UFFDIO_REGISTER) should succeed"); + + /* + * Populate the page in the page cache first via mem_nofault. + * This is required for UFFD_MINOR - the page must exist in the cache. + * Write test data to the page. + */ + memcpy(mem_nofault + offset, buf + offset, page_size); + + /* + * Now access the same page via mem (which has UFFD_MINOR registered). + * Since the page exists in the cache, this should trigger UFFD_MINOR. 
+ */ + fault_addr =3D mem + offset; + args.addr =3D fault_addr; + + ret =3D pthread_create(&fault_thread, NULL, fault_thread_fn, &args); + TEST_ASSERT(ret =3D=3D 0, "pthread_create should succeed"); + + ret =3D read(uffd, &msg, sizeof(msg)); + TEST_ASSERT(ret !=3D -1, "read from userfaultfd should succeed"); + TEST_ASSERT(msg.event =3D=3D UFFD_EVENT_PAGEFAULT, "event type should be = pagefault"); + TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) =3D=3D= fault_addr, + "pagefault should occur at expected address"); + TEST_ASSERT(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR, + "pagefault should be minor fault"); + + /* Resolve the minor fault with UFFDIO_CONTINUE */ + uffd_cont.range.start =3D (unsigned long)fault_addr; + uffd_cont.range.len =3D page_size; + uffd_cont.mode =3D 0; + ret =3D ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont); + TEST_ASSERT(ret !=3D -1, "ioctl(UFFDIO_CONTINUE) should succeed"); + + /* Wait for the faulting thread to complete */ + ret =3D pthread_join(fault_thread, NULL); + TEST_ASSERT(ret =3D=3D 0, "pthread_join should succeed"); + + /* Verify the thread read the correct value */ + TEST_ASSERT(args.value =3D=3D test_val, + "memory should contain the value that was written"); + TEST_ASSERT(*(char *)(mem + offset) =3D=3D test_val, + "no further fault is expected"); + + ret =3D munmap(mem_nofault, total_size); + TEST_ASSERT(!ret, "munmap should succeed"); + + ret =3D munmap(mem, total_size); + TEST_ASSERT(!ret, "munmap should succeed"); + free(buf); + close(uffd); +} + static void test_guest_memfd_flags(struct kvm_vm *vm) { uint64_t valid_flags =3D vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); @@ -383,6 +493,9 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint6= 4_t flags) gmem_test(file_size, vm, flags); gmem_test(fallocate, vm, flags); gmem_test(invalid_punch_hole, vm, flags); + + if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) + gmem_test(uffd_minor, vm, flags); } =20 static void test_guest_memfd(unsigned long vm_type) --=20 2.51.0 From nobody Sat Feb 7 15:40:03 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 56B2936EA98; Tue, 27 Jan 2026 19:31:34 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542294; cv=none; b=DE8HjDDmNTVK71HM51sFYSH56jss0gVwEtR+b6cEf7QkZYFInirq34VPlYlCcdRPlrgPmMrJn7bV/NqzcvdaaCPMeSoMio1X1mRWVC6yvtIzxRj30EtzwMUOhp5yegBf/f6TN9wR+BOojfuOVEFL6VTqyFYul9DWMK6BqZBlonI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1769542294; c=relaxed/simple; bh=Z7I33Hc39huXt1KMAEv+ORYc+prBLTGLG9GW3dkBprM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=korIdqBcwmri/+I5K0zI/hmlDhmgo/WWFwb2YCaPZFjccvBN3LH1Dmk7/K7yEuEXDv8Z3djgQPdtRBj82ba1QyITOClGF7BAuwMUZT5m6J9aZvrnc5wnBsnDQ/dwx0B8LhZrbyYJWEWJYK2Wqbx0OO/AeYijEovOOwQyTe1zzfY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ER3jWOY0; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ER3jWOY0" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 773CEC19425; Tue, 27 Jan 2026 19:31:28 +0000 (UTC) DKIM-Signature: 
v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1769542294; bh=Z7I33Hc39huXt1KMAEv+ORYc+prBLTGLG9GW3dkBprM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ER3jWOY07iJPn8qyjNDfRFyRgYxyWuJjiSMp41SKhB+56SWGzeoPq6P1kW90+cWm2 QTgYAemvCk1w7t5MUfY/l0AkuHfpYBmxwUxPvAWGQI5l5EHWj6aTOFV/Mchq0bEU1C GOG2+qDG6uH2kXDzBlf4UO9UjRz8htNgAuMPwMN0aahfUJeQTxTlfEXYNXD30QDooW GIQamzOkF/J2WRu+6WATk/bFSXVaEhzYGRMmf6YewptxAR1UuZgiZRX7f8snoWqCcS hqMRCqk877/SG5QpuI75iQgPpzMJR/b+46TGplSy9dBIwKC6NM543/hW0GLCkk9KJW IoFmt7fkcyLwQ== From: Mike Rapoport To: linux-mm@kvack.org Cc: Andrea Arcangeli , Andrew Morton , Axel Rasmussen , Baolin Wang , David Hildenbrand , Hugh Dickins , James Houghton , "Liam R. Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Muchun Song , Nikita Kalyazin , Oscar Salvador , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH RFC 17/17] KVM: selftests: test userfaultfd missing for guest_memfd Date: Tue, 27 Jan 2026 21:29:36 +0200 Message-ID: <20260127192936.1250096-18-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260127192936.1250096-1-rppt@kernel.org> References: <20260127192936.1250096-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: Nikita Kalyazin The test demonstrates that a missing userfaultfd event in guest_memfd can be resolved via a UFFDIO_COPY ioctl. Signed-off-by: Nikita Kalyazin Signed-off-by: Mike Rapoport (Microsoft) --- .../testing/selftests/kvm/guest_memfd_test.c | 80 ++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing= /selftests/kvm/guest_memfd_test.c index 7612819e340a..f77e70d22175 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -439,6 +439,82 @@ static void test_uffd_minor(int fd, size_t total_size) close(uffd); } =20 +static void test_uffd_missing(int fd, size_t total_size) +{ + struct uffdio_register uffd_reg; + struct uffdio_copy uffd_copy; + struct uffd_msg msg; + struct fault_args args; + pthread_t fault_thread; + void *mem, *buf =3D NULL; + int uffd, ret; + off_t offset =3D page_size; + void *fault_addr; + const char test_val =3D 0xab; + + ret =3D posix_memalign(&buf, page_size, total_size); + TEST_ASSERT_EQ(ret, 0); + memset(buf, test_val, total_size); + + uffd =3D syscall(__NR_userfaultfd, O_CLOEXEC); + TEST_ASSERT(uffd !=3D -1, "userfaultfd creation should succeed"); + + struct uffdio_api uffdio_api =3D { + .api =3D UFFD_API, + .features =3D 0, + }; + ret =3D ioctl(uffd, UFFDIO_API, &uffdio_api); + TEST_ASSERT(ret !=3D -1, "ioctl(UFFDIO_API) should succeed"); + + mem =3D mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT(mem !=3D MAP_FAILED, "mmap should succeed"); + + uffd_reg.range.start =3D (unsigned long)mem; + uffd_reg.range.len =3D total_size; + uffd_reg.mode =3D UFFDIO_REGISTER_MODE_MISSING; + ret =3D ioctl(uffd, UFFDIO_REGISTER, &uffd_reg); + TEST_ASSERT(ret !=3D -1, "ioctl(UFFDIO_REGISTER) should succeed"); + + fault_addr =3D mem + offset; + args.addr =3D fault_addr; + + ret =3D pthread_create(&fault_thread, NULL, fault_thread_fn, &args); + TEST_ASSERT(ret =3D=3D 0, "pthread_create should 
succeed"); + + ret =3D read(uffd, &msg, sizeof(msg)); + TEST_ASSERT(ret !=3D -1, "read from userfaultfd should succeed"); + TEST_ASSERT(msg.event =3D=3D UFFD_EVENT_PAGEFAULT, "event type should be = pagefault"); + TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) =3D=3D= fault_addr, + "pagefault should occur at expected address"); + TEST_ASSERT(!(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP), + "pagefault should not be write-protect"); + + uffd_copy.dst =3D (unsigned long)fault_addr; + uffd_copy.src =3D (unsigned long)(buf + offset); + uffd_copy.len =3D page_size; + uffd_copy.mode =3D 0; + ret =3D ioctl(uffd, UFFDIO_COPY, &uffd_copy); + TEST_ASSERT(ret !=3D -1, "ioctl(UFFDIO_COPY) should succeed"); + + /* Wait for the faulting thread to complete - this provides the memory ba= rrier */ + ret =3D pthread_join(fault_thread, NULL); + TEST_ASSERT(ret =3D=3D 0, "pthread_join should succeed"); + + /* + * Now it's safe to check args.value - the thread has completed + * and memory is synchronized + */ + TEST_ASSERT(args.value =3D=3D test_val, + "memory should contain the value that was copied"); + TEST_ASSERT(*(char *)(mem + offset) =3D=3D test_val, + "no further fault is expected"); + + ret =3D munmap(mem, total_size); + TEST_ASSERT(!ret, "munmap should succeed"); + free(buf); + close(uffd); +} + static void test_guest_memfd_flags(struct kvm_vm *vm) { uint64_t valid_flags =3D vm_check_cap(vm, KVM_CAP_GUEST_MEMFD_FLAGS); @@ -494,8 +570,10 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint= 64_t flags) gmem_test(fallocate, vm, flags); gmem_test(invalid_punch_hole, vm, flags); =20 - if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) + if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { gmem_test(uffd_minor, vm, flags); + gmem_test(uffd_missing, vm, flags); + } } =20 static void test_guest_memfd(unsigned long vm_type) --=20 2.51.0