From: Mike Rapoport
To: linux-mm@kvack.org
Cc: Andrea Arcangeli, Andrew Morton, Axel Rasmussen, Baolin Wang,
	David Hildenbrand, Hugh Dickins, James Houghton, "Liam R. Howlett",
	Lorenzo Stoakes, Michal Hocko, Mike Rapoport, Nikita Kalyazin,
	Paolo Bonzini, Peter Xu, Sean Christopherson, Shuah Khan,
	Suren Baghdasaryan, Vlastimil Babka, linux-kernel@vger.kernel.org,
	kvm@vger.kernel.org, linux-kselftest@vger.kernel.org,
	"David Hildenbrand (Red Hat)"
Subject: [PATCH v3 1/5] userfaultfd: move vma_can_userfault out of line
Date: Sun, 30 Nov 2025 13:18:08 +0200
Message-ID: <20251130111812.699259-2-rppt@kernel.org>
In-Reply-To: <20251130111812.699259-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

vma_can_userfault() has grown pretty big and it is not called on a
performance-critical path. Move it out of line.

No functional changes.

Reviewed-by: David Hildenbrand (Red Hat)
Reviewed-by: Liam R. Howlett
Signed-off-by: Mike Rapoport (Microsoft)
---
 include/linux/userfaultfd_k.h | 36 ++---------------------------------
 mm/userfaultfd.c              | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index c0e716aec26a..e4f43e7b063f 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -208,40 +208,8 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 	return vma->vm_flags & __VM_UFFD_FLAGS;
 }
 
-static inline bool vma_can_userfault(struct vm_area_struct *vma,
-				     vm_flags_t vm_flags,
-				     bool wp_async)
-{
-	vm_flags &= __VM_UFFD_FLAGS;
-
-	if (vma->vm_flags & VM_DROPPABLE)
-		return false;
-
-	if ((vm_flags & VM_UFFD_MINOR) &&
-	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
-		return false;
-
-	/*
-	 * If wp async enabled, and WP is the only mode enabled, allow any
-	 * memory type.
-	 */
-	if (wp_async && (vm_flags == VM_UFFD_WP))
-		return true;
-
-#ifndef CONFIG_PTE_MARKER_UFFD_WP
-	/*
-	 * If user requested uffd-wp but not enabled pte markers for
-	 * uffd-wp, then shmem & hugetlbfs are not supported but only
-	 * anonymous.
-	 */
-	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
-		return false;
-#endif
-
-	/* By default, allow any of anon|shmem|hugetlb */
-	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
-	       vma_is_shmem(vma);
-}
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+		       bool wp_async);
 
 static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
 {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af61b95c89e4..8dc964389b0d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1977,6 +1977,40 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 	return moved ? moved : err;
 }
 
+bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
+		       bool wp_async)
+{
+	vm_flags &= __VM_UFFD_FLAGS;
+
+	if (vma->vm_flags & VM_DROPPABLE)
+		return false;
+
+	if ((vm_flags & VM_UFFD_MINOR) &&
+	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
+		return false;
+
+	/*
+	 * If wp async enabled, and WP is the only mode enabled, allow any
+	 * memory type.
+	 */
+	if (wp_async && (vm_flags == VM_UFFD_WP))
+		return true;
+
+#ifndef CONFIG_PTE_MARKER_UFFD_WP
+	/*
+	 * If user requested uffd-wp but not enabled pte markers for
+	 * uffd-wp, then shmem & hugetlbfs are not supported but only
+	 * anonymous.
+	 */
+	if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma))
+		return false;
+#endif
+
+	/* By default, allow any of anon|shmem|hugetlb */
+	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+	       vma_is_shmem(vma);
+}
+
 static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
 				     vm_flags_t vm_flags)
 {
-- 
2.51.0
From: Mike Rapoport
To: linux-mm@kvack.org
Subject: [PATCH v3 2/5] userfaultfd, shmem: use a VMA callback to handle UFFDIO_CONTINUE
Date: Sun, 30 Nov 2025 13:18:09 +0200
Message-ID: <20251130111812.699259-3-rppt@kernel.org>
In-Reply-To: <20251130111812.699259-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

When userspace resolves a page fault in a shmem VMA with UFFDIO_CONTINUE,
it needs to get a folio that already exists in the pagecache backing that
VMA.

Instead of using shmem_get_folio() for that, add a get_folio_noalloc()
method to 'struct vm_operations_struct' that returns a folio if it exists
in the VMA's pagecache at the given pgoff.

Implement the get_folio_noalloc() method for shmem and slightly refactor
userfaultfd's mfill_atomic() and mfill_atomic_pte_continue() to support
this new API.

Acked-by: David Hildenbrand (Red Hat)
Reviewed-by: Liam R. Howlett
Signed-off-by: Mike Rapoport (Microsoft)
---
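Note: for a filesystem that keeps its folios in the regular page cache,
the new callback can be a thin wrapper. The sketch below is illustrative
only, not part of this patch; it leans on filemap_lock_folio(), which
already returns the folio locked with a reference held, or
ERR_PTR(-ENOENT) when nothing is cached at that offset, matching the
contract described above:

/*
 * Illustrative sketch (not part of this patch): a minimal
 * get_folio_noalloc() for a page-cache backed mapping.
 * filemap_lock_folio() returns the folio locked with a reference
 * held, or ERR_PTR(-ENOENT) if nothing is cached at @pgoff.
 */
static struct folio *example_get_folio_noalloc(struct inode *inode,
					       pgoff_t pgoff)
{
	return filemap_lock_folio(inode->i_mapping, pgoff);
}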
Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Nikita Kalyazin , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org, "David Hildenbrand (Red Hat)" Subject: [PATCH v3 2/5] userfaultfd, shmem: use a VMA callback to handle UFFDIO_CONTINUE Date: Sun, 30 Nov 2025 13:18:09 +0200 Message-ID: <20251130111812.699259-3-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20251130111812.699259-1-rppt@kernel.org> References: <20251130111812.699259-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" When userspace resolves a page fault in a shmem VMA with UFFDIO_CONTINUE it needs to get a folio that already exists in the pagecache backing that VMA. Instead of using shmem_get_folio() for that, add a get_folio_noalloc() method to 'struct vm_operations_struct' that will return a folio if it exists in the VMA's pagecache at given pgoff. Implement get_folio_noalloc() method for shmem and slightly refactor userfaultfd's mfill_atomic() and mfill_atomic_pte_continue() to support this new API. Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Liam R. Howlett Signed-off-by: Mike Rapoport (Microsoft) --- include/linux/mm.h | 9 ++++++++ mm/shmem.c | 18 ++++++++++++++++ mm/userfaultfd.c | 52 +++++++++++++++++++++++++++++----------------- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..6351a9cde360 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -690,6 +690,15 @@ struct vm_operations_struct { struct page *(*find_normal_page)(struct vm_area_struct *vma, unsigned long addr); #endif /* CONFIG_FIND_NORMAL_PAGE */ +#ifdef CONFIG_USERFAULTFD + /* + * Called by userfault to resolve UFFDIO_CONTINUE request. + * Should return the folio found at pgoff in the VMA's pagecache if it + * exists or ERR_PTR otherwise. + * The returned folio is locked and with reference held. 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 8dc964389b0d..5610f29dac73 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -388,15 +388,12 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 	struct page *page;
 	int ret;
 
-	ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
+	folio = dst_vma->vm_ops->get_folio_noalloc(inode, pgoff);
 	/* Our caller expects us to return -EFAULT if we failed to find folio */
-	if (ret == -ENOENT)
-		ret = -EFAULT;
-	if (ret)
-		goto out;
-	if (!folio) {
-		ret = -EFAULT;
-		goto out;
+	if (IS_ERR_OR_NULL(folio)) {
+		if (PTR_ERR(folio) == -ENOENT || !folio)
+			return -EFAULT;
+		return PTR_ERR(folio);
 	}
 
 	page = folio_file_page(folio, pgoff);
@@ -411,13 +408,12 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
 		goto out_release;
 
 	folio_unlock(folio);
-	ret = 0;
-out:
-	return ret;
+	return 0;
+
 out_release:
 	folio_unlock(folio);
 	folio_put(folio);
-	goto out;
+	return ret;
 }
 
 /* Handles UFFDIO_POISON for all non-hugetlb VMAs. */
@@ -694,6 +690,15 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	return err;
 }
 
+static __always_inline bool vma_can_mfill_atomic(struct vm_area_struct *vma,
+						 uffd_flags_t flags)
+{
+	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+		return vma->vm_ops && vma->vm_ops->get_folio_noalloc;
+
+	return vma_is_anonymous(vma) || vma_is_shmem(vma);
+}
+
 static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 					    unsigned long dst_start,
 					    unsigned long src_start,
@@ -766,10 +771,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 		return  mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
 					     src_start, len, flags);
 
-	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
-		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) &&
-	    uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
+	if (!vma_can_mfill_atomic(dst_vma, flags))
 		goto out_unlock;
 
 	while (src_addr < src_start + len) {
@@ -1985,9 +1987,21 @@ bool vma_can_userfault(struct vm_area_struct *vma, vm_flags_t vm_flags,
 	if (vma->vm_flags & VM_DROPPABLE)
 		return false;
 
-	if ((vm_flags & VM_UFFD_MINOR) &&
-	    (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
-		return false;
+	if (vm_flags & VM_UFFD_MINOR) {
+		/*
+		 * If only MINOR mode is requested and we can request an
+		 * existing folio from VMA's page cache, allow it
+		 */
+		if (vm_flags == VM_UFFD_MINOR && vma->vm_ops &&
+		    vma->vm_ops->get_folio_noalloc)
+			return true;
+		/*
+		 * Only hugetlb and shmem can support MINOR mode in combination
+		 * with other modes
+		 */
+		if (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))
+			return false;
+	}
 
 	/*
 	 * If wp async enabled, and WP is the only mode enabled, allow any
-- 
2.51.0
From: Mike Rapoport
To: linux-mm@kvack.org
Subject: [PATCH v3 3/5] mm: introduce VM_FAULT_UFFD_MINOR fault reason
Date: Sun, 30 Nov 2025 13:18:10 +0200
Message-ID: <20251130111812.699259-4-rppt@kernel.org>
In-Reply-To: <20251130111812.699259-1-rppt@kernel.org>

From: "Mike Rapoport (Microsoft)"

When a VMA is registered with userfaultfd in minor mode, its ->fault()
method should check whether a folio exists in the page cache and, if it
does, call handle_userfault(VM_UFFD_MINOR).

Instead of calling handle_userfault() directly from a specific ->fault()
implementation, introduce a new fault reason, VM_FAULT_UFFD_MINOR, that
notifies the core page fault handler that it should call
handle_userfault(VM_UFFD_MINOR) to complete the page fault.

Replace the call to handle_userfault(VM_UFFD_MINOR) in shmem with the new
VM_FAULT_UFFD_MINOR.

For configurations that don't enable CONFIG_USERFAULTFD,
VM_FAULT_UFFD_MINOR is set to 0.

Suggested-by: David Hildenbrand (Red Hat)
Signed-off-by: Mike Rapoport (Microsoft)
Acked-by: David Hildenbrand (Red Hat)
---
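Note: an illustrative sketch (not part of this patch) of how a ->fault()
handler reports a minor fault with the new reason code instead of calling
handle_userfault() directly; the real conversion of guest_memfd along
these lines follows in the next patch:

/*
 * Illustrative sketch: report VM_FAULT_UFFD_MINOR when the folio is
 * already present and the VMA is registered in minor mode. The core
 * fault handler (__do_fault()) then calls
 * handle_userfault(VM_UFFD_MINOR) on the handler's behalf.
 */
static vm_fault_t example_fault(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;

	folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
	if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
		/* Present page: let uffd complete the fault */
		folio_unlock(folio);
		folio_put(folio);
		return VM_FAULT_UFFD_MINOR;
	}

	/* ... otherwise populate vmf->page from the folio as usual ... */
	return VM_FAULT_SIGBUS;	/* placeholder for the normal path */
}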
 include/linux/mm_types.h | 10 +++++++++-
 mm/memory.c              |  5 ++++-
 mm/shmem.c               |  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 90e5790c318f..c92a52c572c3 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1523,6 +1523,8 @@ typedef __bitwise unsigned int vm_fault_t;
  *			fsync() to complete (for synchronous page faults
  *			in DAX)
  * @VM_FAULT_COMPLETED:	->fault completed, meanwhile mmap lock released
+ * @VM_FAULT_UFFD_MINOR: ->fault did not modify page tables and needs
+ *			handle_userfault(VM_UFFD_MINOR) to complete
  * @VM_FAULT_HINDEX_MASK:	mask HINDEX value
  *
  */
@@ -1540,6 +1542,11 @@ enum vm_fault_reason {
 	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
 	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
 	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
+#ifdef CONFIG_USERFAULTFD
+	VM_FAULT_UFFD_MINOR     = (__force vm_fault_t)0x008000,
+#else
+	VM_FAULT_UFFD_MINOR     = (__force vm_fault_t)0x000000,
+#endif
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
 
@@ -1564,7 +1571,8 @@ enum vm_fault_reason {
 	{ VM_FAULT_FALLBACK,            "FALLBACK" },	\
 	{ VM_FAULT_DONE_COW,            "DONE_COW" },	\
 	{ VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" },	\
-	{ VM_FAULT_COMPLETED,           "COMPLETED" }
+	{ VM_FAULT_COMPLETED,           "COMPLETED" },	\
+	{ VM_FAULT_UFFD_MINOR,          "UFFD_MINOR" } \
 
 struct vm_special_mapping {
 	const char *name;	/* The name, e.g. "[vdso]". */
diff --git a/mm/memory.c b/mm/memory.c
index b59ae7ce42eb..8d2180ec6933 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5280,8 +5280,11 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
 
 	ret = vma->vm_ops->fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
-			    VM_FAULT_DONE_COW)))
+			    VM_FAULT_DONE_COW | VM_FAULT_UFFD_MINOR))) {
+		if (ret & VM_FAULT_UFFD_MINOR)
+			return handle_userfault(vmf, VM_UFFD_MINOR);
 		return ret;
+	}
 
 	folio = page_folio(vmf->page);
 	if (unlikely(PageHWPoison(vmf->page))) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 9f8c54ad0e32..2c32e2398e99 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2460,7 +2460,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
 	if (folio && vma && userfaultfd_minor(vma)) {
 		if (!xa_is_value(folio))
 			folio_put(folio);
-		*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+		*fault_type = VM_FAULT_UFFD_MINOR;
 		return 0;
 	}
 
-- 
2.51.0
Howlett" , Lorenzo Stoakes , Michal Hocko , Mike Rapoport , Nikita Kalyazin , Paolo Bonzini , Peter Xu , Sean Christopherson , Shuah Khan , Suren Baghdasaryan , Vlastimil Babka , linux-kernel@vger.kernel.org, kvm@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [PATCH v3 4/5] guest_memfd: add support for userfaultfd minor mode Date: Sun, 30 Nov 2025 13:18:11 +0200 Message-ID: <20251130111812.699259-5-rppt@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20251130111812.699259-1-rppt@kernel.org> References: <20251130111812.699259-1-rppt@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" From: "Mike Rapoport (Microsoft)" userfaultfd notifications about minor page faults used for live migration and snapshotting of VMs with memory backed by shared hugetlbfs or tmpfs mappings as described in detail in commit 7677f7fd8be7 ("userfaultfd: add minor fault registration mode"). To use the same mechanism for VMs that use guest_memfd to map their memory, guest_memfd should support userfaultfd minor mode. Extend ->fault() method of guest_memfd with ability to notify core page fault handler that a page fault requires handle_userfault(VM_UFFD_MINOR) to complete and add implementation of ->get_folio_noalloc() to guest_memfd vm_ops. Reviewed-by: Liam R. Howlett Signed-off-by: Mike Rapoport (Microsoft) --- virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ffadc5ee8e04..dca6e373937b 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include #include #include +#include =20 #include "kvm_mm.h" =20 @@ -359,7 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct v= m_fault *vmf) if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; =20 - folio =3D kvm_gmem_get_folio(inode, vmf->pgoff); + folio =3D filemap_lock_folio(inode->i_mapping, vmf->pgoff); + if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) { + ret =3D VM_FAULT_UFFD_MINOR; + goto out_folio; + } + + if (PTR_ERR(folio) =3D=3D -ENOENT) + folio =3D kvm_gmem_get_folio(inode, vmf->pgoff); + if (IS_ERR(folio)) { int err =3D PTR_ERR(folio); =20 @@ -390,8 +399,30 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct v= m_fault *vmf) return ret; } =20 +#ifdef CONFIG_USERFAULTFD +static struct folio *kvm_gmem_get_folio_noalloc(struct inode *inode, + pgoff_t pgoff) +{ + struct folio *folio; + + folio =3D filemap_lock_folio(inode->i_mapping, pgoff); + if (IS_ERR_OR_NULL(folio)) + return folio; + + if (!folio_test_uptodate(folio)) { + clear_highpage(folio_page(folio, 0)); + kvm_gmem_mark_prepared(folio); + } + + return folio; +} +#endif + static const struct vm_operations_struct kvm_gmem_vm_ops =3D { .fault =3D kvm_gmem_fault_user_mapping, +#ifdef CONFIG_USERFAULTFD + .get_folio_noalloc =3D kvm_gmem_get_folio_noalloc, +#endif }; =20 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) --=20 2.51.0 From nobody Mon Dec 1 22:06:15 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 5310027934B; Sun, 30 Nov 2025 11:18:50 +0000 (UTC) Authentication-Results: 
 virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index ffadc5ee8e04..dca6e373937b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
 #include <linux/kvm_host.h>
+#include <linux/userfaultfd_k.h>
 
 #include "kvm_mm.h"
 
@@ -359,7 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
 	if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
 		return VM_FAULT_SIGBUS;
 
-	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+	folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
+	if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
+		ret = VM_FAULT_UFFD_MINOR;
+		goto out_folio;
+	}
+
+	if (PTR_ERR(folio) == -ENOENT)
+		folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+
 	if (IS_ERR(folio)) {
 		int err = PTR_ERR(folio);
 
@@ -390,8 +399,30 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
 	return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
+static struct folio *kvm_gmem_get_folio_noalloc(struct inode *inode,
+						pgoff_t pgoff)
+{
+	struct folio *folio;
+
+	folio = filemap_lock_folio(inode->i_mapping, pgoff);
+	if (IS_ERR_OR_NULL(folio))
+		return folio;
+
+	if (!folio_test_uptodate(folio)) {
+		clear_highpage(folio_page(folio, 0));
+		kvm_gmem_mark_prepared(folio);
+	}
+
+	return folio;
+}
+#endif
+
 static const struct vm_operations_struct kvm_gmem_vm_ops = {
 	.fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_USERFAULTFD
+	.get_folio_noalloc = kvm_gmem_get_folio_noalloc,
+#endif
 };
 
 static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
-- 
2.51.0

From: Mike Rapoport
To: linux-mm@kvack.org
Subject: [PATCH v3 5/5] KVM: selftests: test userfaultfd minor for guest_memfd
Date: Sun, 30 Nov 2025 13:18:12 +0200
Message-ID: <20251130111812.699259-6-rppt@kernel.org>
In-Reply-To: <20251130111812.699259-1-rppt@kernel.org>

From: Nikita Kalyazin

The test demonstrates that a minor userfaultfd event in guest_memfd can
be resolved via a memcpy followed by a UFFDIO_CONTINUE ioctl.

Acked-by: Liam R. Howlett
Signed-off-by: Nikita Kalyazin
Co-developed-by: Mike Rapoport (Microsoft)
Signed-off-by: Mike Rapoport (Microsoft)
---
 .../testing/selftests/kvm/guest_memfd_test.c | 97 +++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c
index e7d9aeb418d3..d0cf57d41cc9 100644
--- a/tools/testing/selftests/kvm/guest_memfd_test.c
+++ b/tools/testing/selftests/kvm/guest_memfd_test.c
@@ -10,13 +10,17 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <pthread.h>
 
 #include <linux/bitmap.h>
 #include <linux/falloc.h>
 #include <linux/sizes.h>
+#include <linux/userfaultfd.h>
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
 
 #include "kvm_util.h"
 #include "test_util.h"
@@ -254,6 +258,98 @@ static void test_guest_memfd_flags(struct kvm_vm *vm)
 	}
 }
 
+struct fault_args {
+	char *addr;
+	volatile char value;
+};
+
+static void *fault_thread_fn(void *arg)
+{
+	struct fault_args *args = arg;
+
+	/* Trigger page fault */
+	args->value = *args->addr;
+	return NULL;
+}
+
+static void test_uffd_minor(int fd, size_t total_size)
+{
+	struct uffdio_api uffdio_api = {
+		.api = UFFD_API,
+	};
+	struct uffdio_register uffd_reg;
+	struct uffdio_continue uffd_cont;
+	struct uffd_msg msg;
+	struct fault_args args;
+	pthread_t fault_thread;
+	void *mem, *mem_nofault;
+	int uffd, ret;
+	off_t offset = page_size;
+	void *fault_addr;
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
+	TEST_ASSERT(uffd != -1, "userfaultfd creation should succeed");
+
+	ret = ioctl(uffd, UFFDIO_API, &uffdio_api);
+	TEST_ASSERT(ret != -1, "ioctl(UFFDIO_API) should succeed");
+
+	mem = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	TEST_ASSERT(mem != MAP_FAILED, "mmap should succeed");
+
+	mem_nofault = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	TEST_ASSERT(mem_nofault != MAP_FAILED, "mmap should succeed");
+
+	uffd_reg.range.start = (unsigned long)mem;
+	uffd_reg.range.len = total_size;
+	uffd_reg.mode = UFFDIO_REGISTER_MODE_MINOR;
+	ret = ioctl(uffd, UFFDIO_REGISTER, &uffd_reg);
+	TEST_ASSERT(ret != -1, "ioctl(UFFDIO_REGISTER) should succeed");
+
+	ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+			offset, page_size);
+	TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) should succeed");
+
+	memset(mem_nofault + offset, 0xaa, page_size);
+
+	fault_addr = mem + offset;
+	args.addr = fault_addr;
+
+	ret = pthread_create(&fault_thread, NULL, fault_thread_fn, &args);
+	TEST_ASSERT(ret == 0, "pthread_create should succeed");
+
+	ret = read(uffd, &msg, sizeof(msg));
+	TEST_ASSERT(ret != -1, "read from userfaultfd should succeed");
+	TEST_ASSERT(msg.event == UFFD_EVENT_PAGEFAULT, "event type should be pagefault");
+	TEST_ASSERT((void *)(msg.arg.pagefault.address & ~(page_size - 1)) == fault_addr,
+		    "pagefault should occur at expected address");
+
+	uffd_cont.range.start = (unsigned long)fault_addr;
+	uffd_cont.range.len = page_size;
+	uffd_cont.mode = 0;
+	ret = ioctl(uffd, UFFDIO_CONTINUE, &uffd_cont);
+	TEST_ASSERT(ret != -1, "ioctl(UFFDIO_CONTINUE) should succeed");
+
+	/*
+	 * wait for fault_thread to finish to make sure fault happened and was
+	 * resolved before we verify the values
+	 */
+	ret = pthread_join(fault_thread, NULL);
+	TEST_ASSERT(ret == 0, "pthread_join should succeed");
+
+	TEST_ASSERT(args.value == *(char *)(mem_nofault + offset),
+		    "memory should contain the value that was copied");
+	TEST_ASSERT(args.value == *(char *)(mem + offset),
+		    "no further fault is expected");
+
+	ret = munmap(mem_nofault, total_size);
+	TEST_ASSERT(!ret, "munmap should succeed");
+
+	ret = munmap(mem, total_size);
+	TEST_ASSERT(!ret, "munmap should succeed");
+	close(uffd);
+}
+
 #define gmem_test(__test, __vm, __flags)					\
 	do {									\
 		int fd = vm_create_guest_memfd(__vm, page_size * 4, __flags);	\
@@ -273,6 +369,7 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags)
 	if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) {
 		gmem_test(mmap_supported, vm, flags);
 		gmem_test(fault_overflow, vm, flags);
+		gmem_test(uffd_minor, vm, flags);
 	} else {
 		gmem_test(fault_private, vm, flags);
 	}
-- 
2.51.0