Introduce a generic userfaultfd API for vm_operations_struct, so that one
vma, especially when as a module, can support userfaults without modifying
the core files. More importantly, this also applies when the module can be
compiled out of the kernel.
So, instead of having core mm referencing modules that may not ever exist,
we need to have modules opt-in on core mm hooks instead.
After this API is applied, if a module wants to support userfaultfd, the
module should only need to touch its own file and properly define
vm_uffd_ops, instead of changing anything in core mm.
Note that such an API will not work for anonymous memory. Core mm will
process anonymous memory separately for userfault operations like before.
This patch only introduces the API, so that we can start moving existing
users over without breaking them.
Currently the uffd_copy() API is designed to be about as simple as possible,
with minimal mm changes needed to move over to the API.
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/linux/mm.h | 71 +++++++++++++++++++++++++++++++++++
include/linux/userfaultfd_k.h | 12 ------
2 files changed, 71 insertions(+), 12 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 98a606908307..8dfd83f01d3d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -576,6 +576,70 @@ struct vm_fault {
*/
};
+#ifdef CONFIG_USERFAULTFD
+/* A combined operation mode + behavior flags. */
+typedef unsigned int __bitwise uffd_flags_t;
+
+enum mfill_atomic_mode {
+ MFILL_ATOMIC_COPY,
+ MFILL_ATOMIC_ZEROPAGE,
+ MFILL_ATOMIC_CONTINUE,
+ MFILL_ATOMIC_POISON,
+ NR_MFILL_ATOMIC_MODES,
+};
+
+/* VMA userfaultfd operations */
+typedef struct {
+ /**
+ * @uffd_features: features supported in bitmask.
+ *
+ * When the ops is defined, the driver must set non-zero features
+ * to be a subset (or all) of: VM_UFFD_MISSING|WP|MINOR.
+ */
+ unsigned long uffd_features;
+ /**
+ * @uffd_ioctls: ioctls supported in bitmask.
+ *
+ * Userfaultfd ioctls supported by the module. Below will always
+ * be supported by default whenever a module provides vm_uffd_ops:
+ *
+ * _UFFDIO_API, _UFFDIO_REGISTER, _UFFDIO_UNREGISTER, _UFFDIO_WAKE
+ *
+ * The module needs to provide all the rest optionally supported
+ * ioctls. For example, when VM_UFFD_MISSING was supported,
+ * _UFFDIO_COPY must be supported as ioctl, while _UFFDIO_ZEROPAGE
+ * is optional.
+ */
+ unsigned long uffd_ioctls;
+ /**
+ * uffd_get_folio: Handler to resolve UFFDIO_CONTINUE request.
+ *
+ * @inode: the inode for folio lookup
+ * @pgoff: the pgoff of the folio
+ * @folio: returned folio pointer
+ *
+ * Return: zero if succeeded, negative for errors.
+ */
+ int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff,
+ struct folio **folio);
+ /**
+ * uffd_copy: Handler to resolve UFFDIO_COPY|ZEROPAGE request.
+ *
+ * @dst_pmd: target pmd to resolve page fault
+ * @dst_vma: target vma
+ * @dst_addr: target virtual address
+ * @src_addr: source address to copy from
+ * @flags: userfaultfd request flags
+ * @foliop: previously allocated folio
+ *
+ * Return: zero if succeeded, negative for errors.
+ */
+ int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop);
+} vm_uffd_ops;
+#endif
+
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -653,6 +717,13 @@ struct vm_operations_struct {
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
unsigned long addr);
+#ifdef CONFIG_USERFAULTFD
+ /*
+ * Userfaultfd related ops. Modules need to define this to support
+ * userfaultfd.
+ */
+ const vm_uffd_ops *userfaultfd_ops;
+#endif
};
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ccad58602846..e79c724b3b95 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -80,18 +80,6 @@ struct userfaultfd_ctx {
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
-/* A combined operation mode + behavior flags. */
-typedef unsigned int __bitwise uffd_flags_t;
-
-/* Mutually exclusive modes of operation. */
-enum mfill_atomic_mode {
- MFILL_ATOMIC_COPY,
- MFILL_ATOMIC_ZEROPAGE,
- MFILL_ATOMIC_CONTINUE,
- MFILL_ATOMIC_POISON,
- NR_MFILL_ATOMIC_MODES,
-};
-
#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
--
2.49.0
On 20.06.25 21:03, Peter Xu wrote: Hi Peter, > Introduce a generic userfaultfd API for vm_operations_struct, so that one > vma, especially when as a module, can support userfaults without modifying The sentence is confusing ("vma ... as a module"). Did you mean something like ".. so that a vma that is backed by a special-purpose in-memory filesystem like shmem or hugetlb can support userfaultfd without modifying the uffd core; this is required when the in-memory filesystem is built as a module." > the core files. More importantly, when the module can be compiled out of > the kernel. > > So, instead of having core mm referencing modules that may not ever exist, > we need to have modules opt-in on core mm hooks instead. > > After this API applied, if a module wants to support userfaultfd, the > module should only need to touch its own file and properly define > vm_uffd_ops, instead of changing anything in core mm. Talking about modules that much is a bit confusing. I think this is more about cleanly supporting in-memory filesystems, without the need to special-case each and every one of them; can be viewed a cleanup independent of the module requirement from guest_memfd. > > Note that such API will not work for anonymous. Core mm will process > anonymous memory separately for userfault operations like before. > > This patch only introduces the API alone so that we can start to move > existing users over but without breaking them. > > Currently the uffd_copy() API is almost designed to be the simplistic with > minimum mm changes to move over to the API. > Is there a way to move part of the actual implementation (how this is all wired up) from patch #4 into this patch, to then only remove the old shmem/hugetlb hooks (that are effectively unused) in patch #4? -- Cheers, David / dhildenb
On Mon, Jun 23, 2025 at 10:25:33AM +0200, David Hildenbrand wrote: > On 20.06.25 21:03, Peter Xu wrote: > > Hi Peter, Hey David, > > > Introduce a generic userfaultfd API for vm_operations_struct, so that one > > vma, especially when as a module, can support userfaults without modifying > > The sentence is confusing ("vma ... as a module"). > > Did you mean something like ".. so that a vma that is backed by a > special-purpose in-memory filesystem like shmem or hugetlb can support > userfaultfd without modifying the uffd core; this is required when the > in-memory filesystem is built as a module." I wanted to avoid mentioning of "in-memory file systems" here. How about an updated commit like this? Currently, most of the userfaultfd features are implemented directly in the core mm. It will invoke VMA specific functions whenever necessary. So far it is fine because it almost only interacts with shmem and hugetlbfs. This patch introduces a generic userfaultfd API for vm_operations_struct, so that any type of file (including kernel modules that can be compiled separately from the kernel core) can support userfaults without modifying the core files. After this API applied, if a module wants to support userfaultfd, the module should only need to touch its own file and properly define vm_uffd_ops, instead of changing anything in core mm. ... > > > the core files. More importantly, when the module can be compiled out of > > the kernel. > > > > So, instead of having core mm referencing modules that may not ever exist, > > we need to have modules opt-in on core mm hooks instead. > > > > After this API applied, if a module wants to support userfaultfd, the > > module should only need to touch its own file and properly define > > vm_uffd_ops, instead of changing anything in core mm. > > Talking about modules that much is a bit confusing. 
I think this is more > about cleanly supporting in-memory filesystems, without the need to > special-case each and every one of them; can be viewed a cleanup independent > of the module requirement from guest_memfd. Yes. But if we don't need to support kernel modules actually we don't need this.. IMHO it's so far really about cleanly support kernel modules, which can even be out-of-tree (though that's not my purpose of the change..). Please help check if above updated commit message would be better. > > > > > Note that such API will not work for anonymous. Core mm will process > > anonymous memory separately for userfault operations like before. > > > > This patch only introduces the API alone so that we can start to move > > existing users over but without breaking them. > > > > Currently the uffd_copy() API is almost designed to be the simplistic with > > minimum mm changes to move over to the API. > > > > Is there a way to move part of the actual implementation (how this is all > wired up) from patch #4 into this patch, to then only remove the old > shmem/hugetlb hooks (that are effectively unused) in patch #4? Not much I really removed on the hooks, but I was trying to reuse almost existing functions. Here hugetlb is almost untouched on hooks, then I reused the shmem existing function for uffd_copy() rather than removing it (I did need to remove the definition in the shmem header though becuse it's not needed to be exported). The major thing got removed in patch 4 was some random checks over uffd ops and vma flags. I intentionally made them all in patch 4 to make review possible. Otherwise it can be slightly awkward to reason what got removed without knowing what is protecting those checks. Thanks, -- Peter Xu
On 23.06.25 15:59, Peter Xu wrote: > On Mon, Jun 23, 2025 at 10:25:33AM +0200, David Hildenbrand wrote: >> On 20.06.25 21:03, Peter Xu wrote: >> >> Hi Peter, > > Hey David, > >> >>> Introduce a generic userfaultfd API for vm_operations_struct, so that one >>> vma, especially when as a module, can support userfaults without modifying >> >> The sentence is confusing ("vma ... as a module"). >> >> Did you mean something like ".. so that a vma that is backed by a >> special-purpose in-memory filesystem like shmem or hugetlb can support >> userfaultfd without modifying the uffd core; this is required when the >> in-memory filesystem is built as a module." > > I wanted to avoid mentioning of "in-memory file systems" here. I thought one of the challenges of supporting guest_memfd on anything that is not a special in-memory file system is also related to how the pagecache handles readahead. So ... > > How about an updated commit like this? > > Currently, most of the userfaultfd features are implemented directly in the > core mm. It will invoke VMA specific functions whenever necessary. So far > it is fine because it almost only interacts with shmem and hugetlbfs. > > This patch introduces a generic userfaultfd API for vm_operations_struct, > so that any type of file (including kernel modules that can be compiled > separately from the kernel core) can support userfaults without modifying > the core files. .... is it really "any file" ? I doubt it, but you likely have a better idea on how it all could just work with "any file". > > After this API applied, if a module wants to support userfaultfd, the > module should only need to touch its own file and properly define > vm_uffd_ops, instead of changing anything in core mm. > > ... Talking about files and modules is still confusing I'm afraid. It's really a special-purpose file (really, not any ordinary files on ordinary filesystems), no? > >> >>> the core files. 
More importantly, when the module can be compiled out of >>> the kernel. >>> >>> So, instead of having core mm referencing modules that may not ever exist, >>> we need to have modules opt-in on core mm hooks instead. >>> >>> After this API applied, if a module wants to support userfaultfd, the >>> module should only need to touch its own file and properly define >>> vm_uffd_ops, instead of changing anything in core mm. >> >> Talking about modules that much is a bit confusing. I think this is more >> about cleanly supporting in-memory filesystems, without the need to >> special-case each and every one of them; can be viewed a cleanup independent >> of the module requirement from guest_memfd. > > Yes. But if we don't need to support kernel modules actually we don't need > this.. IMHO it's so far really about cleanly support kernel modules, which > can even be out-of-tree (though that's not my purpose of the change..). > > Please help check if above updated commit message would be better. I agree that another special-purpose file (like implemented by guest_memfd) would need that. But if we could get rid of "hugetlb"/"shmem" special-casing in userfaultfd, it would be a rasonable independent cleanup. But I can spot in patch #3 now: "Hugetlbfs still has its own hard-coded handler in userfaultfd, due to limitations similar to vm_operations_struct.fault(). TODO: generalize it to use the API function." I would have hoped that we clean that up in one go instead. > >> >>> >>> Note that such API will not work for anonymous. Core mm will process >>> anonymous memory separately for userfault operations like before. >>> >>> This patch only introduces the API alone so that we can start to move >>> existing users over but without breaking them. >>> >>> Currently the uffd_copy() API is almost designed to be the simplistic with >>> minimum mm changes to move over to the API. 
>>> >> >> Is there a way to move part of the actual implementation (how this is all >> wired up) from patch #4 into this patch, to then only remove the old >> shmem/hugetlb hooks (that are effectively unused) in patch #4? > > Not much I really removed on the hooks, but I was trying to reuse almost > existing functions. Here hugetlb is almost untouched on hooks, then I > reused the shmem existing function for uffd_copy() rather than removing it > (I did need to remove the definition in the shmem header though becuse it's > not needed to be exported). > > The major thing got removed in patch 4 was some random checks over uffd ops > and vma flags. I intentionally made them all in patch 4 to make review > possible. Otherwise it can be slightly awkward to reason what got removed > without knowing what is protecting those checks. Agreed. It's a shame the new API is not a proper replacement for hugetlb special casing just yet ... -- Cheers, David / dhildenb
On Mon, Jun 23, 2025 at 06:50:42PM +0200, David Hildenbrand wrote: > On 23.06.25 15:59, Peter Xu wrote: > > On Mon, Jun 23, 2025 at 10:25:33AM +0200, David Hildenbrand wrote: > > > On 20.06.25 21:03, Peter Xu wrote: > > > > > > Hi Peter, > > > > Hey David, > > > > > > > > > Introduce a generic userfaultfd API for vm_operations_struct, so that one > > > > vma, especially when as a module, can support userfaults without modifying > > > > > > The sentence is confusing ("vma ... as a module"). > > > > > > Did you mean something like ".. so that a vma that is backed by a > > > special-purpose in-memory filesystem like shmem or hugetlb can support > > > userfaultfd without modifying the uffd core; this is required when the > > > in-memory filesystem is built as a module." > > > > I wanted to avoid mentioning of "in-memory file systems" here. > > I thought one of the challenges of supporting guest_memfd on anything that > is not a special in-memory file system is also related to how the pagecache > handles readahead. > > So ... See uffd_disable_fault_around(). We should make sure no such happens into pgtables when some special type of file is suppoorted, if it ever happens, besides shmem. IIUC readahead on page caches are fine for non-MISSING traps. So a file can support MINOR, for example, but then it'll also need to make sure all those aspected are well considered. > > > > > How about an updated commit like this? > > > > Currently, most of the userfaultfd features are implemented directly in the > > core mm. It will invoke VMA specific functions whenever necessary. So far > > it is fine because it almost only interacts with shmem and hugetlbfs. > > > > This patch introduces a generic userfaultfd API for vm_operations_struct, > > so that any type of file (including kernel modules that can be compiled > > separately from the kernel core) can support userfaults without modifying > > the core files. > > .... is it really "any file" ? 
I doubt it, but you likely have a better idea > on how it all could just work with "any file". > > > > > After this API applied, if a module wants to support userfaultfd, the > > module should only need to touch its own file and properly define > > vm_uffd_ops, instead of changing anything in core mm. > > > > ... > > Talking about files and modules is still confusing I'm afraid. It's really a > special-purpose file (really, not any ordinary files on ordinary > filesystems), no? One major reason I wanted to avoid the term "in-memory" is that we already support most of the files on WP_ASYNC, so emphasizing on in-memory might be misleading, even though WP_ASYNC isn't much taken into the picture of the vm_uffd_ops being proposed. The other thing is, besides the original form of userfaultfd (which is the MISSING traps), almost all the rest (sync-wp, continue, poison, maybe even MOVE but that's still more special) should be at least logically doable on most of the files like WP_ASYNC. When proposing this API, I wanted to make it as generic as possible when people reading about it. Hope that makes sense. Thanks, -- Peter Xu
>>> >>> How about an updated commit like this? >>> >>> Currently, most of the userfaultfd features are implemented directly in the >>> core mm. It will invoke VMA specific functions whenever necessary. So far >>> it is fine because it almost only interacts with shmem and hugetlbfs. >>> >>> This patch introduces a generic userfaultfd API for vm_operations_struct, >>> so that any type of file (including kernel modules that can be compiled >>> separately from the kernel core) can support userfaults without modifying >>> the core files. >> >> .... is it really "any file" ? I doubt it, but you likely have a better idea >> on how it all could just work with "any file". >> >>> >>> After this API applied, if a module wants to support userfaultfd, the >>> module should only need to touch its own file and properly define >>> vm_uffd_ops, instead of changing anything in core mm. >>> >>> ... >> >> Talking about files and modules is still confusing I'm afraid. It's really a >> special-purpose file (really, not any ordinary files on ordinary >> filesystems), no? > > One major reason I wanted to avoid the term "in-memory" is that we already > support most of the files on WP_ASYNC, so emphasizing on in-memory might be > misleading, even though WP_ASYNC isn't much taken into the picture of the > vm_uffd_ops being proposed. Oh, yes, agreed on WP_ASYNC. But they would not be using the vma_ops thingy, right? -- Cheers, David / dhildenb
On Mon, Jun 23, 2025 at 07:25:11PM +0200, David Hildenbrand wrote: > Oh, yes, agreed on WP_ASYNC. But they would not be using the vma_ops thingy, > right? Yes, currently WP_ASYNC bypassed that. So if something declares WP in the uffd_features in the new API it'll be only for sync (and it really should almost start working for most files too like what async did..). -- Peter Xu
Hi Peter, On Fri, Jun 20, 2025 at 03:03:39PM -0400, Peter Xu wrote: > Introduce a generic userfaultfd API for vm_operations_struct, so that one > vma, especially when as a module, can support userfaults without modifying > the core files. More importantly, when the module can be compiled out of > the kernel. > > So, instead of having core mm referencing modules that may not ever exist, > we need to have modules opt-in on core mm hooks instead. > > After this API applied, if a module wants to support userfaultfd, the > module should only need to touch its own file and properly define > vm_uffd_ops, instead of changing anything in core mm. > > Note that such API will not work for anonymous. Core mm will process > anonymous memory separately for userfault operations like before. > > This patch only introduces the API alone so that we can start to move > existing users over but without breaking them. > > Currently the uffd_copy() API is almost designed to be the simplistic with > minimum mm changes to move over to the API. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/linux/mm.h | 71 +++++++++++++++++++++++++++++++++++ > include/linux/userfaultfd_k.h | 12 ------ > 2 files changed, 71 insertions(+), 12 deletions(-) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 98a606908307..8dfd83f01d3d 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -576,6 +576,70 @@ struct vm_fault { > */ > }; > > +#ifdef CONFIG_USERFAULTFD > +/* A combined operation mode + behavior flags. */ > +typedef unsigned int __bitwise uffd_flags_t; > + > +enum mfill_atomic_mode { > + MFILL_ATOMIC_COPY, > + MFILL_ATOMIC_ZEROPAGE, > + MFILL_ATOMIC_CONTINUE, > + MFILL_ATOMIC_POISON, > + NR_MFILL_ATOMIC_MODES, > +}; > + > +/* VMA userfaultfd operations */ > +typedef struct { > + /** > + * @uffd_features: features supported in bitmask. 
> + * > + * When the ops is defined, the driver must set non-zero features > + * to be a subset (or all) of: VM_UFFD_MISSING|WP|MINOR. > + */ > + unsigned long uffd_features; > + /** > + * @uffd_ioctls: ioctls supported in bitmask. > + * > + * Userfaultfd ioctls supported by the module. Below will always > + * be supported by default whenever a module provides vm_uffd_ops: > + * > + * _UFFDIO_API, _UFFDIO_REGISTER, _UFFDIO_UNREGISTER, _UFFDIO_WAKE > + * > + * The module needs to provide all the rest optionally supported > + * ioctls. For example, when VM_UFFD_MISSING was supported, > + * _UFFDIO_COPY must be supported as ioctl, while _UFFDIO_ZEROPAGE > + * is optional. > + */ > + unsigned long uffd_ioctls; > + /** > + * uffd_get_folio: Handler to resolve UFFDIO_CONTINUE request. > + * > + * @inode: the inode for folio lookup > + * @pgoff: the pgoff of the folio > + * @folio: returned folio pointer > + * > + * Return: zero if succeeded, negative for errors. > + */ > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > + struct folio **folio); > + /** > + * uffd_copy: Handler to resolve UFFDIO_COPY|ZEROPAGE request. > + * > + * @dst_pmd: target pmd to resolve page fault > + * @dst_vma: target vma > + * @dst_addr: target virtual address > + * @src_addr: source address to copy from > + * @flags: userfaultfd request flags > + * @foliop: previously allocated folio > + * > + * Return: zero if succeeded, negative for errors. > + */ > + int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, > + unsigned long dst_addr, unsigned long src_addr, > + uffd_flags_t flags, struct folio **foliop); > +} vm_uffd_ops; > +#endif Can't we define vm_uffd_ops in userfaultfd_k.h? A forward declaration in mm.h should suffice and modules that want to use uffd can include userfaultfd_k.h. 
> + > /* > * These are the virtual MM functions - opening of an area, closing and > * unmapping it (needed to keep files on disk up-to-date etc), pointer > @@ -653,6 +717,13 @@ struct vm_operations_struct { > */ > struct page *(*find_special_page)(struct vm_area_struct *vma, > unsigned long addr); > +#ifdef CONFIG_USERFAULTFD > + /* > + * Userfaultfd related ops. Modules need to define this to support > + * userfaultfd. > + */ > + const vm_uffd_ops *userfaultfd_ops; > +#endif > }; > > #ifdef CONFIG_NUMA_BALANCING > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index ccad58602846..e79c724b3b95 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -80,18 +80,6 @@ struct userfaultfd_ctx { > > extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); > > -/* A combined operation mode + behavior flags. */ > -typedef unsigned int __bitwise uffd_flags_t; > - > -/* Mutually exclusive modes of operation. */ > -enum mfill_atomic_mode { > - MFILL_ATOMIC_COPY, > - MFILL_ATOMIC_ZEROPAGE, > - MFILL_ATOMIC_CONTINUE, > - MFILL_ATOMIC_POISON, > - NR_MFILL_ATOMIC_MODES, > -}; > - > #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) > #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) > #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) > -- > 2.49.0 > -- Sincerely yours, Mike.
On Sun, Jun 22, 2025 at 10:28:04AM +0300, Mike Rapoport wrote: > > +} vm_uffd_ops; > > +#endif > > Can't we define vm_uffd_ops in userfaultfd_k.h? > > A forward declaration in mm.h should suffice and modules that want to use > uffd can include userfaultfd_k.h. Good point, I'll do that, thanks! -- Peter Xu
© 2016 - 2025 Red Hat, Inc.