Introduce a generic userfaultfd API for vm_operations_struct, so that a
vma, especially one backed by a module, can support userfaults without
modifying core mm files. This matters most when the module can be
compiled out of the kernel.
Instead of having core mm reference modules that may not even exist, have
modules opt in to the core mm hooks.
With this API applied, a module that wants to support userfaultfd only
needs to touch its own file and properly define vm_uffd_ops, instead of
changing anything in core mm.
Note that this API does not cover anonymous memory. Core mm will keep
processing userfault operations on anonymous memory separately, as before.
This patch introduces the API alone, so that we can start to move existing
users over without breaking them.
For now, the uffd_copy() API is deliberately kept as simple as possible,
so that existing users can move over to it with minimal mm changes.
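As an illustration of the intended usage (not part of this patch; all
mydrv_* names are hypothetical), a module would opt in roughly like this:

/* Sketch: a module declaring minor-fault-only userfaultfd support. */
static const struct vm_uffd_ops mydrv_uffd_ops = {
	.uffd_features	= VM_UFFD_MINOR,
	.uffd_ioctls	= BIT(_UFFDIO_CONTINUE),
	/* Resolves UFFDIO_CONTINUE by looking up the backing folio. */
	.uffd_get_folio	= mydrv_uffd_get_folio,
};

static const struct vm_operations_struct mydrv_vm_ops = {
	.fault		 = mydrv_fault,
#ifdef CONFIG_USERFAULTFD
	.userfaultfd_ops = &mydrv_uffd_ops,
#endif
};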
Signed-off-by: Peter Xu <peterx@redhat.com>
---
include/linux/mm.h | 9 ++++++
include/linux/userfaultfd_k.h | 52 +++++++++++++++++++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef40f68c1183..6a5447bd43fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -576,6 +576,8 @@ struct vm_fault {
*/
};
+struct vm_uffd_ops;
+
/*
* These are the virtual MM functions - opening of an area, closing and
* unmapping it (needed to keep files on disk up-to-date etc), pointer
@@ -653,6 +655,13 @@ struct vm_operations_struct {
*/
struct page *(*find_special_page)(struct vm_area_struct *vma,
unsigned long addr);
+#ifdef CONFIG_USERFAULTFD
+ /*
+ * Userfaultfd related ops. Modules need to define this to support
+ * userfaultfd.
+ */
+ const struct vm_uffd_ops *userfaultfd_ops;
+#endif
};
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index df85330bcfa6..c9a093c4502b 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -92,6 +92,58 @@ enum mfill_atomic_mode {
NR_MFILL_ATOMIC_MODES,
};
+/* VMA userfaultfd operations */
+struct vm_uffd_ops {
+ /**
+ * @uffd_features: features supported in bitmask.
+ *
+ * When the ops are defined, the driver must set non-zero features
+ * that are a subset (or all) of: VM_UFFD_MISSING|WP|MINOR.
+ */
+ unsigned long uffd_features;
+ /**
+ * @uffd_ioctls: ioctls supported in bitmask.
+ *
+ * Userfaultfd ioctls supported by the module. The following will
+ * always be supported by default whenever a module provides vm_uffd_ops:
+ *
+ * _UFFDIO_API, _UFFDIO_REGISTER, _UFFDIO_UNREGISTER, _UFFDIO_WAKE
+ *
+ * The module needs to provide the rest of the optionally supported
+ * ioctls. For example, when VM_UFFD_MISSING is supported,
+ * _UFFDIO_COPY must be supported as an ioctl, while _UFFDIO_ZEROPAGE
+ * is optional.
+ */
+ unsigned long uffd_ioctls;
+ /**
+ * @uffd_get_folio: Handler to resolve UFFDIO_CONTINUE request.
+ *
+ * @inode: the inode for folio lookup
+ * @pgoff: the pgoff of the folio
+ * @folio: returned folio pointer
+ *
+ * Return: zero on success, negative on error.
+ */
+ int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff,
+ struct folio **folio);
+ /**
+ * @uffd_copy: Handler to resolve UFFDIO_COPY|ZEROPAGE request.
+ *
+ * @dst_pmd: target pmd to resolve page fault
+ * @dst_vma: target vma
+ * @dst_addr: target virtual address
+ * @src_addr: source address to copy from
+ * @flags: userfaultfd request flags
+ * @foliop: previously allocated folio
+ *
+ * Return: zero on success, negative on error.
+ */
+ int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ uffd_flags_t flags, struct folio **foliop);
+};
+typedef struct vm_uffd_ops vm_uffd_ops;
+
#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
#define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
#define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
--
2.49.0
On Fri, Jun 27, 2025 at 11:46:52AM -0400, Peter Xu wrote: > Introduce a generic userfaultfd API for vm_operations_struct, so that one > vma, especially when as a module, can support userfaults without modifying > the core files. More importantly, when the module can be compiled out of > the kernel. I'm not sure I understand this. One VMA 'especially when as a module'? This whole sentence is really unclear. So it seems to me that you want to be able to allow more than just: - anonymous memory - shmem - hugetlb To support userfaultfd correct? > > So, instead of having core mm referencing modules that may not ever exist, > we need to have modules opt-in on core mm hooks instead. OK this sounds reasonable. > > After this API applied, if a module wants to support userfaultfd, the > module should only need to touch its own file and properly define > vm_uffd_ops, instead of changing anything in core mm. Yes this also sounds reasonable, _as long as_ :) the module authors are aware of any conditions that might exist in the VMA that might be odd or confusing. For instance, if locks need to be held etc. I am generally slightly wary of us giving VMAs in hooks because people do crazy things with them that > > Note that such API will not work for anonymous. Core mm will process > anonymous memory separately for userfault operations like before. Right. > > This patch only introduces the API alone so that we can start to move > existing users over but without breaking them. Sounds sensible. > > Currently the uffd_copy() API is almost designed to be the simplistic with > minimum mm changes to move over to the API. A good approach. > > Signed-off-by: Peter Xu <peterx@redhat.com> > --- > include/linux/mm.h | 9 ++++++ > include/linux/userfaultfd_k.h | 52 +++++++++++++++++++++++++++++++++++ > 2 files changed, 61 insertions(+) > > diff --git a/include/linux/mm.h b/include/linux/mm.h > index ef40f68c1183..6a5447bd43fd 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -576,6 +576,8 @@ struct vm_fault { > */ > }; > > +struct vm_uffd_ops; > + > /* > * These are the virtual MM functions - opening of an area, closing and > * unmapping it (needed to keep files on disk up-to-date etc), pointer > @@ -653,6 +655,13 @@ struct vm_operations_struct { > */ > struct page *(*find_special_page)(struct vm_area_struct *vma, > unsigned long addr); > +#ifdef CONFIG_USERFAULTFD > + /* > + * Userfaultfd related ops. Modules need to define this to support > + * userfaultfd. > + */ > + const struct vm_uffd_ops *userfaultfd_ops; > +#endif > }; This shouldn't go in vm_ops like this. You're basically changing a fundamental convention here - that _ops structs define handlers, now you can have somehow nested ones? It seems more appropriate to have something like: struct vm_uffd_ops *(*get_uffd_ops)(struct vm_area_struct *vma); This then matches e.g. mempolicy's get_policy handler. > > #ifdef CONFIG_NUMA_BALANCING > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > index df85330bcfa6..c9a093c4502b 100644 > --- a/include/linux/userfaultfd_k.h > +++ b/include/linux/userfaultfd_k.h > @@ -92,6 +92,58 @@ enum mfill_atomic_mode { > NR_MFILL_ATOMIC_MODES, > }; > > +/* VMA userfaultfd operations */ Are we sure this should be operating at the VMA level? I mean are the operations going to be operating on VMAs or VM fault structs? If not, then this surely belongs to the file operations no? 
> +struct vm_uffd_ops { I'd comment on the naming, but 'vm_operations_struct' is so bad that it seems we have no sensible convention anyway so this is fine :P > + /** > + * @uffd_features: features supported in bitmask. > + * > + * When the ops is defined, the driver must set non-zero features I don't think the 'when the ops are defined' bit is necessary here, you're commenting on the ops here, you can safely assume they're defined. So I'd just say 'must be a subset (or all) of: VM_UFFD_MISSING|WP|MINOR.' > + * to be a subset (or all) of: VM_UFFD_MISSING|WP|MINOR. > + */ > + unsigned long uffd_features; > + /** > + * @uffd_ioctls: ioctls supported in bitmask. > + * > + * Userfaultfd ioctls supported by the module. Below will always > + * be supported by default whenever a module provides vm_uffd_ops: > + * > + * _UFFDIO_API, _UFFDIO_REGISTER, _UFFDIO_UNREGISTER, _UFFDIO_WAKE > + * > + * The module needs to provide all the rest optionally supported > + * ioctls. For example, when VM_UFFD_MISSING was supported, > + * _UFFDIO_COPY must be supported as ioctl, while _UFFDIO_ZEROPAGE > + * is optional. I'm not sure why we need this field? Isn't this implied by whether handlers are set or NULL? > + */ > + unsigned long uffd_ioctls; > + /** > + * uffd_get_folio: Handler to resolve UFFDIO_CONTINUE request. > + * > + * @inode: the inode for folio lookup > + * @pgoff: the pgoff of the folio > + * @folio: returned folio pointer > + * > + * Return: zero if succeeded, negative for errors. > + */ > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > + struct folio **folio); Not sure the uffd_ prefix is needed in a struct that's obviously about uffd already? I'd strip that. Also this name is really confusing, I think it should contain continue in some form, such as 'handle_continue()'? This really feels like it shouldn't be a VMA function as you're dealing with inode, pgoff, and folio and not VMA at all? > + /** > + * uffd_copy: Handler to resolve UFFDIO_COPY|ZEROPAGE request. > + * > + * @dst_pmd: target pmd to resolve page fault > + * @dst_vma: target vma > + * @dst_addr: target virtual address > + * @src_addr: source address to copy from > + * @flags: userfaultfd request flags > + * @foliop: previously allocated folio > + * > + * Return: zero if succeeded, negative for errors. Can you please ensure you put details as to VMA lock state here. Uffd has some very tricky handling around stuff like this. > + */ > + int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, > + unsigned long dst_addr, unsigned long src_addr, > + uffd_flags_t flags, struct folio **foliop); Do we not need a uffd_ctx parameter here? It seems like we're assuming a _lot_ of mm understanding in the underlying driver here. I'm not sure it's really normal to be handing around page table state and folios etc. to a driver like this, this is really... worrying to me. This feels like you're trying to put mm functionality outside of mm? What if we change how we handle page tables in future, or the locking changes? We might not know to go and update x, y, z driver. This is concerning. > +}; > +typedef struct vm_uffd_ops vm_uffd_ops; > + > #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) > #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) > #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) > -- > 2.49.0 >
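A minimal sketch of the get_uffd_ops() indirection suggested above, mirroring mempolicy's get_policy handler; shmem_uffd_get_folio() and the exact ops contents here are assumptions, not code from the series:

static const struct vm_uffd_ops shmem_uffd_ops = {
	.uffd_features	= VM_UFFD_MISSING | VM_UFFD_WP | VM_UFFD_MINOR,
	.uffd_get_folio	= shmem_uffd_get_folio,
	.uffd_copy	= shmem_mfill_atomic_pte,
};

/* A handler in vm_operations_struct instead of a nested ops pointer. */
static const struct vm_uffd_ops *shmem_get_uffd_ops(struct vm_area_struct *vma)
{
	return &shmem_uffd_ops;
}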
On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > > On Fri, Jun 27, 2025 at 11:46:52AM -0400, Peter Xu wrote: > > Introduce a generic userfaultfd API for vm_operations_struct, so that one > > vma, especially when as a module, can support userfaults without modifying > > the core files. More importantly, when the module can be compiled out of > > the kernel. > > I'm not sure I understand this. One VMA 'especially when as a module'? > > This whole sentence is really unclear. > > So it seems to me that you want to be able to allow more than just: > > - anonymous memory > - shmem > - hugetlb > > To support userfaultfd correct? > > > > > So, instead of having core mm referencing modules that may not ever exist, > > we need to have modules opt-in on core mm hooks instead. > > OK this sounds reasonable. > > > > > After this API applied, if a module wants to support userfaultfd, the > > module should only need to touch its own file and properly define > > vm_uffd_ops, instead of changing anything in core mm. > > Yes this also sounds reasonable, _as long as_ :) the module authors are > aware of any conditions that might exist in the VMA that might be odd or > confusing. > > For instance, if locks need to be held etc. > > I am generally slightly wary of us giving VMAs in hooks because people do > crazy things with them that > > > > > Note that such API will not work for anonymous. Core mm will process > > anonymous memory separately for userfault operations like before. > > Right. > > > > > This patch only introduces the API alone so that we can start to move > > existing users over but without breaking them. > > Sounds sensible. > > > > > Currently the uffd_copy() API is almost designed to be the simplistic with > > minimum mm changes to move over to the API. > > A good approach. > > > > > Signed-off-by: Peter Xu <peterx@redhat.com> > > --- > > include/linux/mm.h | 9 ++++++ > > include/linux/userfaultfd_k.h | 52 +++++++++++++++++++++++++++++++++++ > > 2 files changed, 61 insertions(+) > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index ef40f68c1183..6a5447bd43fd 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -576,6 +576,8 @@ struct vm_fault { > > */ > > }; > > > > +struct vm_uffd_ops; > > + > > /* > > * These are the virtual MM functions - opening of an area, closing and > > * unmapping it (needed to keep files on disk up-to-date etc), pointer > > @@ -653,6 +655,13 @@ struct vm_operations_struct { > > */ > > struct page *(*find_special_page)(struct vm_area_struct *vma, > > unsigned long addr); > > +#ifdef CONFIG_USERFAULTFD > > + /* > > + * Userfaultfd related ops. Modules need to define this to support > > + * userfaultfd. > > + */ > > + const struct vm_uffd_ops *userfaultfd_ops; > > +#endif > > }; > > This shouldn't go in vm_ops like this. You're basically changing a > fundamental convention here - that _ops structs define handlers, now you > can have somehow nested ones? > > It seems more appropriate to have something like: > > struct vm_uffd_ops *(*get_uffd_ops)(struct vm_area_struct *vma); > > This then matches e.g. mempolicy's get_policy handler. 
> > > > > #ifdef CONFIG_NUMA_BALANCING > > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h > > index df85330bcfa6..c9a093c4502b 100644 > > --- a/include/linux/userfaultfd_k.h > > +++ b/include/linux/userfaultfd_k.h > > @@ -92,6 +92,58 @@ enum mfill_atomic_mode { > > NR_MFILL_ATOMIC_MODES, > > }; > > > > +/* VMA userfaultfd operations */ > > Are we sure this should be operating at the VMA level? > > I mean are the operations going to be operating on VMAs or VM fault > structs? If not, then this surely belongs to the file operations no? > > > +struct vm_uffd_ops { > > I'd comment on the naming, but 'vm_operations_struct' is so bad that it > seems we have no sensible convention anyway so this is fine :P > > > + /** > > + * @uffd_features: features supported in bitmask. > > + * > > + * When the ops is defined, the driver must set non-zero features > > I don't think the 'when the ops are defined' bit is necessray here, you're > commenting on the ops here, you can safely assume they're defined. > > So I'd just say 'must be a subset (or all) of: VM_UFFD_MISSING|WP|MINOR.' > > > + * to be a subset (or all) of: VM_UFFD_MISSING|WP|MINOR. > > + */ > > + unsigned long uffd_features; > > + /** > > + * @uffd_ioctls: ioctls supported in bitmask. > > + * > > + * Userfaultfd ioctls supported by the module. Below will always > > + * be supported by default whenever a module provides vm_uffd_ops: > > + * > > + * _UFFDIO_API, _UFFDIO_REGISTER, _UFFDIO_UNREGISTER, _UFFDIO_WAKE > > + * > > + * The module needs to provide all the rest optionally supported > > + * ioctls. For example, when VM_UFFD_MISSING was supported, > > + * _UFFDIO_COPY must be supported as ioctl, while _UFFDIO_ZEROPAGE > > + * is optional. > > I'm not sure why we need this field? Isn't this implied by whether handlers > are set or NULL? > > > + */ > > + unsigned long uffd_ioctls; > > + /** > > + * uffd_get_folio: Handler to resolve UFFDIO_CONTINUE request. > > + * > > + * @inode: the inode for folio lookup > > + * @pgoff: the pgoff of the folio > > + * @folio: returned folio pointer > > + * > > + * Return: zero if succeeded, negative for errors. > > + */ > > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > > + struct folio **folio); > > Not sure uufd_ prefix is needed in a struct that's obviously about uffd > already? I'd strip that. > > Also this name is really confusing, I think it should contain continue in > some form, such as 'handle_continue()'? > > This really feels like it shouldn't be a VMA function as you're dealing > with inode, pgoff, and folio and not VMA at all? > > > > + /** > > + * uffd_copy: Handler to resolve UFFDIO_COPY|ZEROPAGE request. > > + * > > + * @dst_pmd: target pmd to resolve page fault > > + * @dst_vma: target vma > > + * @dst_addr: target virtual address > > + * @src_addr: source address to copy from > > + * @flags: userfaultfd request flags > > + * @foliop: previously allocated folio > > + * > > + * Return: zero if succeeded, negative for errors. > > Can you please ensure you put details as to VMA lock state here. Uffd has > some very tricky handling around stuff like this. > > > + */ > > + int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma, > > + unsigned long dst_addr, unsigned long src_addr, > > + uffd_flags_t flags, struct folio **foliop); > > Do we not need a uffd_ctx parameter here? > > It seems like we're assuming a _lot_ of mm understanding in the underlying > driver here. 
> > I'm not sure it's really normal to be handing around page table state and > folios etc. to a driver like this, this is really... worrying to me. > > This feels like you're trying to put mm functionality outside of mm? To second that, two things stick out for me here: 1. uffd_copy and uffd_get_folio seem to be at different abstraction levels. uffd_copy is almost the entire copy operation for VM_SHARED VMAs while uffd_get_folio is a small part of the continue operation. 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the last patch is quite a complex function which itself calls some IMO pretty internal functions like mfill_atomic_install_pte(). Expecting modules to implement such functionality seems like a stretch to me but maybe this is for some specialized modules which are written by mm experts only? > > What if we change how we handle page tables in future, or the locking > changes? We might not know to go and update x, y, z driver. > > This is concerning. > > > +}; > > +typedef struct vm_uffd_ops vm_uffd_ops; > > + > > #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1) > > #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr)) > > #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr)) > > -- > > 2.49.0 > >
On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes > <lorenzo.stoakes@oracle.com> wrote: > > > > It seems like we're assuming a _lot_ of mm understanding in the underlying > > driver here. > > > > I'm not sure it's really normal to be handing around page table state and > > folios etc. to a driver like this, this is really... worrying to me. > > > > This feels like you're trying to put mm functionality outside of mm? > > To second that, two things stick out for me here: > 1. uffd_copy and uffd_get_folio seem to be at different abstraction > levels. uffd_copy is almost the entire copy operation for VM_SHARED > VMAs while uffd_get_folio is a small part of the continue operation. > 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the > last patch is quite a complex function which itself calls some IMO > pretty internal functions like mfill_atomic_install_pte(). Expecting > modules to implement such functionality seems like a stretch to me but > maybe this is for some specialized modules which are written by mm > experts only? Largely shmem_mfill_atomic_pte() differs from anonymous memory version (mfill_atomic_pte_copy()) by the way the allocated folio is accounted and whether it's added to the page cache. So instead of uffd_copy(...) we might add int (*folio_alloc)(struct vm_area_struct *vma, unsigned long dst_addr); void (*folio_release)(struct vm_area_struct *vma, struct folio *folio); and then use them in mfill_atomic_pte_copy(): diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bc473ad21202..6bad0dd70d3d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -247,8 +247,11 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr); + if (uffd_ops(dst_vma) && uffd_ops(dst_vma)->folio_alloc) + folio = uffd_ops(dst_vma)->folio_alloc(dst_vma, dst_addr); + else + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, + dst_addr); if (!folio) goto out; @@ -307,6 +310,8 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, return ret; out_release: folio_put(folio); + if (uffd_ops(dst_vma) && uffd_ops(dst_vma)->folio_release) + uffd_ops(dst_vma)->folio_release(dst_vma, folio); goto out; } -- Sincerely yours, Mike.
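For a backend with no special accounting needs, the proposed pair might be as small as the sketch below; the mydrv_* names are made up, and the folio-pointer return type follows the usage in the diff rather than the int prototypes above:

static struct folio *mydrv_uffd_folio_alloc(struct vm_area_struct *vma,
					    unsigned long dst_addr)
{
	/* No driver-specific accounting: a plain movable highuser folio. */
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, dst_addr);
}

static void mydrv_uffd_folio_release(struct vm_area_struct *vma,
				     struct folio *folio)
{
	/* Drop the reference taken at allocation time. */
	folio_put(folio);
}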
On Wed, Jul 02, 2025 at 09:16:31PM +0300, Mike Rapoport wrote: > On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: > > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes > > <lorenzo.stoakes@oracle.com> wrote: > > > > > > It seems like we're assuming a _lot_ of mm understanding in the underlying > > > driver here. > > > > > > I'm not sure it's really normal to be handing around page table state and > > > folios etc. to a driver like this, this is really... worrying to me. > > > > > > This feels like you're trying to put mm functionality outside of mm? > > > > To second that, two things stick out for me here: > > 1. uffd_copy and uffd_get_folio seem to be at different abstraction > > levels. uffd_copy is almost the entire copy operation for VM_SHARED > > VMAs while uffd_get_folio is a small part of the continue operation. > > 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the > > last patch is quite a complex function which itself calls some IMO > > pretty internal functions like mfill_atomic_install_pte(). Expecting > > modules to implement such functionality seems like a stretch to me but > > maybe this is for some specialized modules which are written by mm > > experts only? > > Largely shmem_mfill_atomic_pte() differs from anonymous memory version > (mfill_atomic_pte_copy()) by the way the allocated folio is accounted and > whether it's added to the page cache. So instead of uffd_copy(...) we might > add > > int (*folio_alloc)(struct vm_area_struct *vma, unsigned long dst_addr); > void (*folio_release)(struct vm_area_struct *vma, struct folio *folio); Thanks for digging into this, Mike. It's just that IMHO it may not be enough.. I actually tried to think about a more complicated API, but the more I thought about it, the more it looked like overkill. I can list something here to show why I chose the simplest solution with uffd_copy() as of now. Firstly, see shmem_inode_acct_blocks() at the entrance: that's shmem accounting we need to do before allocations, and with/without allocations. That accounting can't be put into folio_alloc() yet even if we'll have one, because we could have the folio allocated when reaching here (that is, when *foliop != NULL). That was a very deliberate decision of ours to do shmem accounting separately in 2016: https://lore.kernel.org/all/20161216144821.5183-37-aarcange@redhat.com/ Then, there's also the complexity of how the page cache should be managed for any mem type. For shmem, the folio is only injected right before the pgtable installation. We can't inject it at folio_alloc() time because then others could fault it in before the data is populated. It means we at least need one more API to do page cache injection for the folio that was just allocated from folio_alloc(). We also may want to have different treatment of how the folio flags are set up. It may not always happen in folio_alloc(). E.g. for shmem right now we do this right before the page cache injection: VM_BUG_ON(folio_test_locked(folio)); VM_BUG_ON(folio_test_swapbacked(folio)); __folio_set_locked(folio); __folio_set_swapbacked(folio); __folio_mark_uptodate(folio); We may not want to do exactly the same for all the rest of the mem types. E.g. we likely don't want to set swapbacked for guest-memfd folios. We may need one more API to do it. Then, thinking about the failure path, given the question above on the shmem acct issue, we may want to have yet another post_failure hook doing shmem_inode_unacct_blocks() properly for shmem..
maybe we don't need that for guest-memfd, but we still need that for shmem to properly unacct when folio allocation succeeded, but copy_from_user failed somehow. Then the question is, do we really need all this fuss for almost nothing? Note that if we want, IIUC we can still change this in the future. The initial isolation like this series would still be good to land earlier; we don't even plan to support MISSING for guest-memfd in the near future, but only MINOR mode for now. We don't necessarily need to work out MISSING immediately to move on with landing useful features in Linux. Even if we'll have it for guest-memfd, it shouldn't be a challenge to maintain both. This is just not a contract we need to sign forever yet. Hope the above clarifies a bit why I chose the simplest solution as of now. I also don't like this API, as I mentioned in the cover letter. It's really a trade-off I made; at least for now it's the best I can come up with. That said, if any of us has a better solution, please shoot. I'm always open to better alternatives. Thanks, -- Peter Xu
On Wed, Jul 2, 2025 at 1:23 PM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Jul 02, 2025 at 09:16:31PM +0300, Mike Rapoport wrote: > > On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: > > > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes > > > <lorenzo.stoakes@oracle.com> wrote: > > > > > > > > It seems like we're assuming a _lot_ of mm understanding in the underlying > > > > driver here. > > > > > > > > I'm not sure it's really normal to be handing around page table state and > > > > folios etc. to a driver like this, this is really... worrying to me. > > > > > > > > This feels like you're trying to put mm functionality outside of mm? > > > > > > To second that, two things stick out for me here: > > > 1. uffd_copy and uffd_get_folio seem to be at different abstraction > > > levels. uffd_copy is almost the entire copy operation for VM_SHARED > > > VMAs while uffd_get_folio is a small part of the continue operation. > > > 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the > > > last patch is quite a complex function which itself calls some IMO > > > pretty internal functions like mfill_atomic_install_pte(). Expecting > > > modules to implement such functionality seems like a stretch to me but > > > maybe this is for some specialized modules which are written by mm > > > experts only? > > > > Largely shmem_mfill_atomic_pte() differs from anonymous memory version > > (mfill_atomic_pte_copy()) by the way the allocated folio is accounted and > > whether it's added to the page cache. So instead of uffd_copy(...) we might > > add > > > > int (*folio_alloc)(struct vm_area_struct *vma, unsigned long dst_addr); > > void (*folio_release)(struct vm_area_struct *vma, struct folio *folio); > > Thanks for digging into this, Mike. It's just that IMHO it may not be > enough.. > > I actually tried to think about a more complicated API, but more I thought > of that, more it looked like an overkill. I can list something here to > show why I chose the simplest solution with uffd_copy() as of now. TBH below does not sound like an overkill to me for keeping mm parts to itself without over-exposing them to modules. > > Firstly, see shmem_inode_acct_blocks() at the entrance: that's shmem > accounting we need to do before allocations, and with/without allocations. Ok, this results in an additional folio_prealloc() hook. > That accounting can't be put into folio_alloc() yet even if we'll have one, > because we could have the folio allocated when reaching here (that is, when > *foliop != NULL). That was a very delicated decision of us to do shmem > accounting separately in 2016: > > https://lore.kernel.org/all/20161216144821.5183-37-aarcange@redhat.com/ > > Then, there's also the complexity on how the page cache should be managed > for any mem type. For shmem, folio was only injected right before the > pgtable installations. We can't inject it when folio_alloc() because then > others can fault-in without data populated. It means we at least need one > more API to do page cache injections for the folio just got allocated from > folio_alloc(). folio_add_to_cache() hook? > > We also may want to have different treatment on how the folio flags are > setup. It may not always happen in folio_alloc(). E.g. 
for shmem right > now we do this right before the page cache injections: > > VM_BUG_ON(folio_test_locked(folio)); > VM_BUG_ON(folio_test_swapbacked(folio)); > __folio_set_locked(folio); > __folio_set_swapbacked(folio); > __folio_mark_uptodate(folio); > > We may not want to do exactly the same for all the rest mem types. E.g. we > likely don't want to set swapbacked for guest-memfd folios. We may need > one more API to do it. Can we do that inside folio_add_to_cache() hook before doing the injection? > > Then if to think about failure path, when we have the question above on > shmem acct issue, we may want to have yet another post_failure hook doing > shmem_inode_unacct_blocks() properly for shmem.. maybe we don't need that > for guest-memfd, but we still need that for shmem to properly unacct when > folio allocation succeeded, but copy_from_user failed somehow. Failure handling hook. > > Then the question is, do we really need all these fuss almost for nothing? If that helps to keep modules simpler and mm details contained inside the core kernel I think it is worth doing. I imagine if the shmem was a module then implementing the current API would require exporting functions like mfill_atomic_install_pte(). That seems like over-exposure to me. And if we can avoid all the locking and refcounting that Liam mentioned that would definitely be worth it IMHO. > > Note that if we want, IIUC we can still change this in the future. The > initial isolation like this series would still be good to land earlier; we > don't even plan to support MISSING for guest-memfd in the near future, but > only MINOR mode for now. We don't necessarily need to work out MISSING > immediately to move on useful features landing Linux. Even if we'll have > it for guest-memfd, it shouldn't be a challenge to maintain both. This is > just not a contract we need to sign forever yet. > > Hope above clarifies a bit on why I chose the simplest solution as of > now. I also don't like this API, as I mentioned in the cover letter. It's > really a trade-off I made at least for now the best I can come up with. > > Said that, if any of us has better solution, please shoot. I'm always open > to better alternatives. I didn't explore this code as deep as you have done and therefore might not see all the pitfalls but looks like you already considered an alternative which does sound better to me. What are the drawbacks that I might be missing? Thanks, Suren. > > Thanks, > > -- > Peter Xu >
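Collected in one place, the finer-grained hooks floated in this sub-thread (preallocation/accounting, page cache insertion, failure handling) might shape the ops roughly as below; every name and prototype here is speculative:

struct vm_uffd_ops {
	unsigned long uffd_features;
	unsigned long uffd_ioctls;
	/* Account/reserve backing space before any folio allocation. */
	int (*folio_prealloc)(struct vm_area_struct *vma, unsigned long dst_addr);
	/* Set folio flags and insert into the page cache, right before
	 * core mm installs the pte. */
	int (*folio_add_to_cache)(struct vm_area_struct *vma,
				  struct folio *folio, pgoff_t pgoff);
	/* Undo the accounting when allocation succeeded but the copy failed. */
	void (*copy_failure)(struct vm_area_struct *vma, struct folio *folio);
	/* Unchanged: resolve UFFDIO_CONTINUE by folio lookup. */
	int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff,
			      struct folio **folio);
};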
On Thu, Jul 03, 2025 at 08:01:12AM -0700, Suren Baghdasaryan wrote: > On Wed, Jul 2, 2025 at 1:23 PM Peter Xu <peterx@redhat.com> wrote: > > > > On Wed, Jul 02, 2025 at 09:16:31PM +0300, Mike Rapoport wrote: > > > On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: > > > > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes > > > > <lorenzo.stoakes@oracle.com> wrote: > > > > > > > > > > It seems like we're assuming a _lot_ of mm understanding in the underlying > > > > > driver here. > > > > > > > > > > I'm not sure it's really normal to be handing around page table state and > > > > > folios etc. to a driver like this, this is really... worrying to me. > > > > > > > > > > This feels like you're trying to put mm functionality outside of mm? > > > > > > > > To second that, two things stick out for me here: > > > > 1. uffd_copy and uffd_get_folio seem to be at different abstraction > > > > levels. uffd_copy is almost the entire copy operation for VM_SHARED > > > > VMAs while uffd_get_folio is a small part of the continue operation. > > > > 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the > > > > last patch is quite a complex function which itself calls some IMO > > > > pretty internal functions like mfill_atomic_install_pte(). Expecting > > > > modules to implement such functionality seems like a stretch to me but > > > > maybe this is for some specialized modules which are written by mm > > > > experts only? > > > > > > Largely shmem_mfill_atomic_pte() differs from anonymous memory version > > > (mfill_atomic_pte_copy()) by the way the allocated folio is accounted and > > > whether it's added to the page cache. So instead of uffd_copy(...) we might > > > add > > > > > > int (*folio_alloc)(struct vm_area_struct *vma, unsigned long dst_addr); > > > void (*folio_release)(struct vm_area_struct *vma, struct folio *folio); > > > > Thanks for digging into this, Mike. It's just that IMHO it may not be > > enough.. > > > > I actually tried to think about a more complicated API, but more I thought > > of that, more it looked like an overkill. I can list something here to > > show why I chose the simplest solution with uffd_copy() as of now. > > TBH below does not sound like an overkill to me for keeping mm parts > to itself without over-exposing them to modules. > > > > > Firstly, see shmem_inode_acct_blocks() at the entrance: that's shmem > > accounting we need to do before allocations, and with/without allocations. > > Ok, this results in an additional folio_prealloc() hook. > > > That accounting can't be put into folio_alloc() yet even if we'll have one, > > because we could have the folio allocated when reaching here (that is, when > > *foliop != NULL). That was a very delicated decision of us to do shmem > > accounting separately in 2016: > > > > https://lore.kernel.org/all/20161216144821.5183-37-aarcange@redhat.com/ > > > > Then, there's also the complexity on how the page cache should be managed > > for any mem type. For shmem, folio was only injected right before the > > pgtable installations. We can't inject it when folio_alloc() because then > > others can fault-in without data populated. It means we at least need one > > more API to do page cache injections for the folio just got allocated from > > folio_alloc(). > > folio_add_to_cache() hook? > > > > > We also may want to have different treatment on how the folio flags are > > setup. It may not always happen in folio_alloc(). E.g. 
for shmem right > > now we do this right before the page cache injections: > > > > VM_BUG_ON(folio_test_locked(folio)); > > VM_BUG_ON(folio_test_swapbacked(folio)); > > __folio_set_locked(folio); > > __folio_set_swapbacked(folio); > > __folio_mark_uptodate(folio); > > > > We may not want to do exactly the same for all the rest mem types. E.g. we > > likely don't want to set swapbacked for guest-memfd folios. We may need > > one more API to do it. > > Can we do that inside folio_add_to_cache() hook before doing the injection? The folio can be freed outside this function by userfaultfd callers, so I wonder if it'll crash the kernel if mm sees a folio locked while being freed. > > > > > Then if to think about failure path, when we have the question above on > > shmem acct issue, we may want to have yet another post_failure hook doing > > shmem_inode_unacct_blocks() properly for shmem.. maybe we don't need that > > for guest-memfd, but we still need that for shmem to properly unacct when > > folio allocation succeeded, but copy_from_user failed somehow. > > Failure handling hook. > > > > > Then the question is, do we really need all these fuss almost for nothing? > > If that helps to keep modules simpler and mm details contained inside > the core kernel I think it is worth doing. I imagine if the shmem was > a module then implementing the current API would require exporting > functions like mfill_atomic_install_pte(). That seems like > over-exposure to me. And if we can avoid all the locking and > refcounting that Liam mentioned that would definitely be worth it > IMHO. > > > > > Note that if we want, IIUC we can still change this in the future. The > > initial isolation like this series would still be good to land earlier; we > > don't even plan to support MISSING for guest-memfd in the near future, but > > only MINOR mode for now. We don't necessarily need to work out MISSING > > immediately to move on useful features landing Linux. Even if we'll have > > it for guest-memfd, it shouldn't be a challenge to maintain both. This is > > just not a contract we need to sign forever yet. [1] > > > > Hope above clarifies a bit on why I chose the simplest solution as of > > now. I also don't like this API, as I mentioned in the cover letter. It's > > really a trade-off I made at least for now the best I can come up with. > > > > Said that, if any of us has better solution, please shoot. I'm always open > > to better alternatives. > > I didn't explore this code as deep as you have done and therefore > might not see all the pitfalls but looks like you already considered > an alternative which does sound better to me. What are the drawbacks > that I might be missing? It'll be a complex API from another angle, that we'll need 5-6 APIs just to implement uffd missing mode. Meanwhile it'll also already start to introduce those function jumps into shmem even if one would be enough to decouple it. As I mentioned above, I'm also not against doing it, but IMHO that does not need to be done right now [1], as currently it looks to me guest-memfd may not really need missing mode traps as of now, and I am actually not sure whether it will. We may not want to introduce 5-6 APIs with no further user. We can also always discuss a proper API when the demand arrives. If uffd_copy is so concerning to people, there's also another alternative which is to make vm_uffd_ops only support traps except MISSING. uffd_copy is only about missing traps. Then we can drop uffd_copy API, but the API itself is then broken on its own. 
I still feel like we're over-worrying about this. For OOT drivers I never cared about breaking anything. For in-tree ones, this discussion can really happen when there's a need to. And at that time we can also have a look at the impl using uffd_copy(); maybe it'll not be that bad either. It seems to me we don't necessarily need to figure this out right now. IMHO we can do it in two steps in the worst case. Thanks, -- Peter Xu
On Thu, Jul 03, 2025 at 11:45:40AM -0400, Peter Xu wrote: > I still feel like we're over-worried on this. For OOT drivers I never > cared breaking anything. For in-tree ones, this discussion can really > happen when there's a need to. And at that time we can also have a look at > the impl using uffd_copy(), maybe it'll not be that bad either. It seems > to me we don't necessarily need to figure this out right now. IMHO we can > do it two-steps in worst case. This kind of approach in relation to f_op->mmap() is precisely how I ended up being cc'd on an embargoed report of a critical zero-day security flaw. So I think if anything there is under-worrying going on here.
On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes > <lorenzo.stoakes@oracle.com> wrote: > > This feels like you're trying to put mm functionality outside of mm? > > To second that, two things stick out for me here: > 1. uffd_copy and uffd_get_folio seem to be at different abstraction > levels. uffd_copy is almost the entire copy operation for VM_SHARED > VMAs while uffd_get_folio is a small part of the continue operation. > 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the > last patch is quite a complex function which itself calls some IMO > pretty internal functions like mfill_atomic_install_pte(). Expecting > modules to implement such functionality seems like a stretch to me but > maybe this is for some specialized modules which are written by mm > experts only? To echo what Liam said - I don't think we can truly rely on expertise here (we make enough mistakes in core mm for that to be a dubious proposition even there :) and even if experts were involved, having core mm functionality outside of core mm carries significant risk - we are constantly changing things, including assumptions around sensitive topics such as locking (think VMA locking) - having code elsewhere significantly increases the risk of missing things. I am also absolutely, to be frank, not going to accept us EXPORT()'ing anything core. Page table manipulation really must reside in core mm and arch code only, it is easily some of the most subtle, confusing and dangerous code in mm (I have spent substantial hours banging my head against it recently), and again - subject to constant change. But to come back to Liam's comments and to reiterate what I was referring to earlier, even permitting drivers to have access to VMAs is _highly_ problematic and has resulted in very real bugs and subtle issues that took many hours, much stress + gnashing of teeth to address. The very thing of: xxx <hand off sensitive mutable state X, Y, Z to driver> yyy Means that between xxx and yyy we can make literally no assumptions about what just happened to all handed off state. A single instance of this has caused mayhem; if we did this in such a way as to affect the _many_ uffd hooks we could have a really serious problem. So - what seems really positive about this series is the _generalisation_ and _abstraction_ of uffd functionality. That is something I appreciate and I think uffd sorely needs, in fact if we could find a way to not need to do: if (some_uffd_predicate()) some_uffd_specific_fn(); That'd be incredible. So I think the answer here is to do something like this, and to keep all the mm-specific code in core mm. Thanks, Lorenzo
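As a sketch of what that single dispatch point could look like, reusing the uffd_ops() helper assumed in Mike's diff upthread (mfill_get_folio() is a hypothetical core-mm-side wrapper, not code from the series):

static inline const struct vm_uffd_ops *uffd_ops(struct vm_area_struct *vma)
{
	return vma->vm_ops ? vma->vm_ops->userfaultfd_ops : NULL;
}

/* Core mm dispatches once here, instead of scattering per-type predicates. */
static int mfill_get_folio(struct vm_area_struct *vma, pgoff_t pgoff,
			   struct folio **foliop)
{
	const struct vm_uffd_ops *ops = uffd_ops(vma);

	if (!ops || !ops->uffd_get_folio)
		return -EINVAL;

	return ops->uffd_get_folio(file_inode(vma->vm_file), pgoff, foliop);
}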
On Wed, Jul 02, 2025 at 04:56:04PM +0100, Lorenzo Stoakes wrote: > I am also absolutely, to be frank, not going to accept us EXPORT()'ing > anything core. Could you be explicit on what you're absolutely against? Is that the vm_uffd_ops (or any form of it) being exported, or anything else? Thanks, -- Peter Xu
On Wed, Jul 02, 2025 at 04:24:25PM -0400, Peter Xu wrote: > On Wed, Jul 02, 2025 at 04:56:04PM +0100, Lorenzo Stoakes wrote: > > I am also absolutely, to be frank, not going to accept us EXPORT()'ing > > anything core. > > Could you be explicit on what you're absolutely against? Is that the > vm_uffd_ops (or any form of it) being exported, or anything else? Page table manipulation, core mm internals. In fact I think we need to minimise as much exposure as possible. > > Thanks, > > -- > Peter Xu >
On 02/07/2025 16:56, Lorenzo Stoakes wrote: > On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: >> On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes >> <lorenzo.stoakes@oracle.com> wrote: >>> This feels like you're trying to put mm functionality outside of mm? >> >> To second that, two things stick out for me here: >> 1. uffd_copy and uffd_get_folio seem to be at different abstraction >> levels. uffd_copy is almost the entire copy operation for VM_SHARED >> VMAs while uffd_get_folio is a small part of the continue operation. >> 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the >> last patch is quite a complex function which itself calls some IMO >> pretty internal functions like mfill_atomic_install_pte(). Expecting >> modules to implement such functionality seems like a stretch to me but >> maybe this is for some specialized modules which are written by mm >> experts only? > > To echo what Liam said - I don't think we can truly rely on expertise here > (we make enough mistakes in core mm for that to be a dubious proposition > even tere :) and even if experts were involved, having core mm > functionality outside of core mm carries significant risk - we are > constantly changing things, including assumptions around sensitive topics > such as locking (think VMA locking) - having code elsewhere significantly > increases the risk of missing things. > > I am also absolutely, to be frank, not going to accept us EXPORT()'ing > anything core. > > Page table manipulation really must rely in core mm and arch code only, it > is easily some of the most subtle, confusing and dangerous code in mm (I > have spent subtantial hours banging my head against it recently), and again > - subject to constant change. > > But to come back to Liam's comments and to reiterate what I was referring > to earlier, even permitting drivers to have access to VMAs is _highly_ > problematic and has resulted in very real bugs and subtle issues that took > many hours, much stress + gnashing of teeth to adress. The main target of this change is the implementation of UFFD for KVM/guest_memfd (examples: [1], [2]) to avoid bringing KVM-specific code into the mm codebase. We usually mean KVM by the "drivers" in this context, and it is already somewhat "knowledgeable" of the mm. I don't think there are existing use cases for other drivers to implement this at the moment. Although I can't see new exports in this series, there is now a way to limit exports to particular modules [3]. Would it help if we only do it for KVM initially (if/when actually needed)? [1] https://lore.kernel.org/all/114133f5-0282-463d-9d65-3143aa658806@amazon.com/ [2] https://lore.kernel.org/all/7666ee96-6f09-4dc1-8cb2-002a2d2a29cf@amazon.com/ [3] https://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git/commit/?h=kbuild&id=707f853d7fa3ce323a6875487890c213e34d81a0 Thanks, Nikita > > The very thing of: > > xxx > <hand off sensitive mutable state X, Y, Z to driver> > yyy > > Means that between xxx and yyy we can make literally no assumptions about > what just happened to all handed off state. A single instance of this has > caused mayhem, if we did this in such a way as to affect the _many_ uffd > hooks we could have a realy serious problem. > > So - what seems really positive about this series is the _generalisation_ > and _abstraction_ of uffd functionality. 
> > That is something I appreciate and I think uffd sorely needs, in fact if we > could find a way to not need to do: > > if (some_uffd_predicate()) > some_uffd_specific_fn(); > > That'd be incredible. > > So I think the answer here is to do something like this, and to keep all > the mm-specific code in core mm. > > Thanks, Lorenzo
* Nikita Kalyazin <kalyazin@amazon.com> [250702 13:08]: > > > On 02/07/2025 16:56, Lorenzo Stoakes wrote: > > On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote: > > > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes > > > <lorenzo.stoakes@oracle.com> wrote: > > > > This feels like you're trying to put mm functionality outside of mm? > > > > > > To second that, two things stick out for me here: > > > 1. uffd_copy and uffd_get_folio seem to be at different abstraction > > > levels. uffd_copy is almost the entire copy operation for VM_SHARED > > > VMAs while uffd_get_folio is a small part of the continue operation. > > > 2. shmem_mfill_atomic_pte which becomes uffd_copy for shmem in the > > > last patch is quite a complex function which itself calls some IMO > > > pretty internal functions like mfill_atomic_install_pte(). Expecting > > > modules to implement such functionality seems like a stretch to me but > > > maybe this is for some specialized modules which are written by mm > > > experts only? > > > > To echo what Liam said - I don't think we can truly rely on expertise here > > (we make enough mistakes in core mm for that to be a dubious proposition > > even tere :) and even if experts were involved, having core mm > > functionality outside of core mm carries significant risk - we are > > constantly changing things, including assumptions around sensitive topics > > such as locking (think VMA locking) - having code elsewhere significantly > > increases the risk of missing things. > > > > I am also absolutely, to be frank, not going to accept us EXPORT()'ing > > anything core. > > > > Page table manipulation really must rely in core mm and arch code only, it > > is easily some of the most subtle, confusing and dangerous code in mm (I > > have spent subtantial hours banging my head against it recently), and again > > - subject to constant change. > > > > But to come back to Liam's comments and to reiterate what I was referring > > to earlier, even permitting drivers to have access to VMAs is _highly_ > > problematic and has resulted in very real bugs and subtle issues that took > > many hours, much stress + gnashing of teeth to adress. > > The main target of this change is the implementation of UFFD for > KVM/guest_memfd (examples: [1], [2]) to avoid bringing KVM-specific code > into the mm codebase. We usually mean KVM by the "drivers" in this context, > and it is already somewhat "knowledgeable" of the mm. I don't think there > are existing use cases for other drivers to implement this at the moment. > > Although I can't see new exports in this series, there is now a way to limit > exports to particular modules [3]. Would it help if we only do it for KVM > initially (if/when actually needed)? > > [1] > https://lore.kernel.org/all/114133f5-0282-463d-9d65-3143aa658806@amazon.com/ > [2] > https://lore.kernel.org/all/7666ee96-6f09-4dc1-8cb2-002a2d2a29cf@amazon.com/ > [3] https://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git/commit/?h=kbuild&id=707f853d7fa3ce323a6875487890c213e34d81a0 > That's because the entry point is from a function pointer, so [3] won't help at all. It is recreating the situation that existed for the vma through the vm_ops in mmap, but for uffd. And at a lower level (page tables). I do not want to relive that experience. We are not doing this. It is for the benefit of everyone that we are not doing this. We need to find another way. Regards, Liam
On Wed, Jul 02, 2025 at 05:24:02PM -0400, Liam R. Howlett wrote: > That's because the entry point is from a function pointer, so [3] won't > help at all. > > It is recreating the situation that existed for the vma through the > vm_ops in mmap, but for uffd. And at a lower level (page tables). I do not > want to relive that experience. > > We are not doing this. It is for the benefit of everyone that we are > not doing this. Is the vma issue the "allowing vma->vm_flags to be modified anywhere" issue? Or is there a pointer to the issue being discussed if not? > > We need to find another way. Could you suggest something? The minimum goal is to allow guest-memfd to support minor faults. Thanks, -- Peter Xu
* Peter Xu <peterx@redhat.com> [250702 17:36]: > On Wed, Jul 02, 2025 at 05:24:02PM -0400, Liam R. Howlett wrote: > > That's because the entry point is from a function pointer, so [3] won't > > help at all. > > > > It is recreating the situation that existed for the vma through the > > vm_ops in mmap, but for uffd. And at a lower level (page tables). I do not > > want to relive that experience. > > > > We are not doing this. It is for the benefit of everyone that we are > > not doing this. > > Is the vma issue about "allowing vma->vm_flags to be modified anywhere" > issue? Or is there a pointer to the issue being discussed if not? The issue is passing pointers of structs that are protected by locks or ref counters into modules to do what they please. vma->vm_flags was an example of where we learned how wrong this can go. There is also the concern of the state of the folio on return from the callback. The error handling gets messy quick. Now, imagine we have something that gets a folio, but then we find a solution for contention of a lock or ref count (whatever is next), but it doesn't work because the mm code has been bleeding into random modules and we have no clue what that module is supposed to be doing, or we can't make the necessary change because this module will break userspace, or cause a performance decrease, or any other random thing that we cannot work around without rewriting (probably suboptimally) something we don't maintain. Again, these are examples of how this can go bad but not an exhaustive list by any means. So the issue is with allowing modules to play with the folio and page tables on their own. If this is outside the mm, we probably won't even be Cc'ed on modules that use it. And do we want to be Cc'ed on modules that want to use it? We will most likely be Cc'ed or emailed directly on the resulting memory leak/security issue that results in what should be mm code. It'll be a Saturday because it always is.. :) Even the example use code had a potential ref leak that you found [1]. > > > > We need to find another way. > > Could you suggest something? The minimum goal is to allow guest-memfd > support minor faults. Mike brought up another idea, that seems worth looking into. Thanks, Liam [1] https://lore.kernel.org/all/7455220c-e35b-4509-b7c3-a78fde5b12d5@amazon.com/
On Wed, Jul 02, 2025 at 10:00:51PM -0400, Liam R. Howlett wrote: > * Peter Xu <peterx@redhat.com> [250702 17:36]: > > On Wed, Jul 02, 2025 at 05:24:02PM -0400, Liam R. Howlett wrote: > > > That's because the entry point is from a function pointer, so [3] won't > > > help at all. > > > > > > It is recreating the situation that existed for the vma through the > > > vm_ops in mmap, but for uffd. And at a lower level (page tables). I do not > > > want to relive that experience. > > > > > > We are not doing this. It is for the benefit of everyone that we are > > > not doing this. > > > > Is the vma issue about "allowing vma->vm_flags to be modified anywhere" > > issue? Or is there a pointer to the issue being discussed if not? > > The issue is passing pointers of structs that are protected by locks or > ref counters into modules to do what they please. > > vma->vm_flags was an example of where we learned how wrong this can go. > > There is also the concern of the state of the folio on return from the > callback. The error handling gets messy quick. > > Now, imagine we have something that gets a folio, but then we find a > solution for contention of a lock or ref count (whatever is next), but > it doesn't work because the mm code has been bleeding into random > modules and we have no clue what that module is supposed to be doing, or > we can't make the necessary change because this module will break > userspace, or cause a performance decrease, or any other random thing > that we cannot work around without rewriting (probably suboptimally) > something we don't maintain. > > Again, these are examples of how this can go bad but not an exhaustive > list by any means. > > So the issue is with allowing modules to play with the folio and page > tables on their own. I understand the concern, however IMHO that's really why mm can be hard and important at the same time.. We definitely have driver code manipulating pgtables. We also have folios or pages that can be directly accessible from drivers. After all mm is the core function provider for those and there needs to be some API accessing them from outside. I agree some protection would be nice, like what Suren did with the vm_flags using __private, even though it's unfortunate it only works with sparse, not a warn/error when compiling, as vm_flags is not a pointer. OTOH, forbidding any exposure might be overkill, IMHO. It stops mm from growing in healthy ways. > > If this is outside the mm, we probably won't even be Cc'ed on modules > that use it. > > And do we want to be Cc'ed on modules that want to use it? For this specific case, I'm happy to be copied if guest-memfd will start to support userfaultfd, because obviously I also work with the kvm community. It'll be the same if not, as I'm listed as a userfaultfd reviewer. But when it's in the modules, it should really be the module's job. It's ok too when it's behind an API and mm people do not get copied. It looks fine to me. > > We will most likely be Cc'ed or emailed directly on the resulting memory > leak/security issue that results in what should be mm code. It'll be a > Saturday because it always is.. :) True, it's just unavoidable IMHO, and after it's triaged, the module owner needs to figure out how to fix it, not a mm developer, if the bug only happens with the module. It's the same when a module allocates a folio/page and randomly updates its flags. It may also crash core mm later.
We can have more protections all over the place but I don't see an easy way to completely separate core mm from modules. > > Even the example use code had a potential ref leak that you found [1]. That's totally ok. I appreciate Nikita's help completely and never thought of it as an issue. IMHO the leak is not a big deal in verifying the API. > > > > > > We need to find another way. > > Could you suggest something? The minimum goal is to allow guest-memfd > support minor faults. > Mike brought up another idea, that seems worth looking into. I replied to Mike already before we extended this thread. Feel free to chime in with any suggestions on top. So far this series is still almost the best I can think of. Thanks, -- Peter Xu
* Peter Xu <peterx@redhat.com> [250703 11:24]:
> On Wed, Jul 02, 2025 at 10:00:51PM -0400, Liam R. Howlett wrote:
>
> [...]
>
> > So the issue is with allowing modules to play with the folio and page
> > tables on their own.
>
> I understand the concern, however IMHO that's really why mm can be hard
> and important at the same time..

I'm glad you understand, but I think you do not understand the severity
of the concern and the repercussions.

These patches, as they exist today, are not going to be accepted. I am
not okay with them going in, and I don't see many saying differently. I
am not okay with you pushing for this any longer.

Several people have told you it is not a good idea - the people who have
to deal with the fallout of this when it goes bad, and it _will_ go bad.
You have been given ample reasons why: technical reasons as well as real
examples of failures - and security bugs - that have resulted from the
exact same pattern in the exact same structure where you are reproducing
the pattern.

Please stop trying to push this plan. It is a waste of time and energy.
We need a new plan.

> We definitely have driver code manipulating pgtables. We also have
> folios or pages that are directly accessible from drivers.
>
> After all, mm is the core function provider for those, and there needs
> to be some API for accessing them from outside.

But that's not what you are doing; you are handing out pointers and
expecting the modules to do the core function. And the core function that
was suggested in the example user already had a ref counting issue.

Bugs happen, but that sort of thing is going to be difficult to find, and
it won't be the driver author hunting it down, potentially under an
embargo. And good luck validating what was done on return from the
module.

> I agree some protection would be nice, like what Suren did with the
> vm_flags using __private [...]. OTOH, forbidding any exposure at all
> might be overkill, IMHO. It stops mm from growing in healthy ways.

This isn't healthy; it is repeating the errors of the past and expecting
a different result.

This isn't about balance or forbidding any exposure, it's about finding a
less bug-prone way of getting your feature to work. I'm not saying it's
wrong TO do this. I'm saying these patches are the wrong way OF doing
this - from experience of dealing with the same pattern.

Anyways, I am not spending more time fighting about this. Let me know
when you come up with an alternative approach.

Regards,
Liam
On Thu, Jul 03, 2025 at 11:24:21AM -0400, Peter Xu wrote:
> On Wed, Jul 02, 2025 at 10:00:51PM -0400, Liam R. Howlett wrote:
>
> [...]
>
> > So the issue is with allowing modules to play with the folio and page
> > tables on their own.
>
> I understand the concern, however IMHO that's really why mm can be hard
> and important at the same time..

I feel like you're dismissing the concerns altogether, honestly.

> We definitely have driver code manipulating pgtables. We also have
> folios or pages that are directly accessible from drivers.

'There's already bad stuff going on' is not an argument for doing more.

> After all, mm is the core function provider for those, and there needs
> to be some API for accessing them from outside.

The point being made here is what are internals and what are not. We
don't expose internals; we do expose a carefully controlled interface
that minimises the risk of things being broken.

> I agree some protection would be nice, like what Suren did with the
> vm_flags using __private [...]. OTOH, forbidding any exposure at all
> might be overkill, IMHO. It stops mm from growing in healthy ways.

I'm not sure what is healthy about no longer being able to make
assumptions about what code does because hooks are being invoked with
drivers doing _whatever they like_.

Part of the purpose of review is avoiding making decisions that cause
problems down the line.

> > If this is outside the mm, we probably won't even be Cc'ed on modules
> > that use it.
> >
> > And do we want to be Cc'ed on modules that want to use it?
>
> For this specific case, I'm happy to be copied if guest-memfd will start
> to support userfaultfd, because obviously I also work with the kvm
> community. It'll be the same if not, as I'm listed as a userfaultfd
> reviewer.
>
> But when it's in the modules, it should really be the module's job. It's
> also OK when it's an API and mm people do not get copied. That looks
> fine to me.

This is ridiculous - we expose mm internals to modules, and then no
longer have to care when they break things, or a subtle thing changes in
mm?

On the one hand you argue that in-tree drivers can be 'monitored' and
therefore it's fine, but on the other hand you say it doesn't matter what
they do and it's nothing to do with us, so we shouldn't care about being
informed of changes?

These two positions are contradictory, and neither is good.

> > We will most likely be Cc'ed or emailed directly on the resulting
> > memory leak/security issue that results in what should be mm code.
> > It'll be a Saturday because it always is.. :)
>
> True, but it's just unavoidable IMHO, and after triage the module owner
> needs to figure out how to fix it, not an mm developer, if the bug only
> happens with the module.

Except it impacts mm as a whole. And it is avoidable - we can just _not
do this_.

> It's the same when a module allocates a folio/page and randomly updates
> its flags. That may also crash core mm later. We can add more
> protections all over the place, but I don't see an easy way to
> completely separate core mm from modules.

Yes, if modules absolutely abuse things then it's problematic, but the
issue is that mm has a whole host of extremely subtle considerations. I
documented a lot of these at
https://kernel.org/doc/html/latest/mm/process_addrs.html

These details change quite often, including some extremely sensitive and
delicate details around locking. Do you really think module authors are
going to correctly implement all of this? It's very obvious these are
internal implementation details.

> > Even the example use code had a potential ref leak that you found [1].
>
> That's totally OK. I appreciate Nikita's help completely and never
> thought of it as an issue. IMHO the leak is not a big deal in verifying
> the API.

I think it's quite telling as to your under-worrying :) about this stuff
to think that that's not a problem. You guys have already made subtle
errors in implementing the simplest possible use of the API. This just
makes the point for us, I think?

> > > > We need to find another way.
> > >
> > > Could you suggest something? The minimum goal is to allow
> > > guest-memfd to support minor faults.
> >
> > Mike brought up another idea, that seems worth looking into.
>
> I replied to Mike already before we extended this thread. Feel free to
> chime in with any suggestions on top. So far this series is still almost
> the best I can think of.

I mean, if you don't want to consider alternatives, maybe this series
simply isn't viable?

I made suggestions in the other thread re: a very _soft_ interface where
we implement things in core mm but _configure_ them in modules as an
alternative. That would avoid exposure of mm internals.

I really don't think a series that exposes anything even vaguely
sensitive is viable, imo.

> Thanks,
>
> --
> Peter Xu
On Wed, Jul 02, 2025 at 06:08:44PM +0100, Nikita Kalyazin wrote:
>
> On 02/07/2025 16:56, Lorenzo Stoakes wrote:
> > On Tue, Jul 01, 2025 at 10:04:28AM -0700, Suren Baghdasaryan wrote:
> > > On Mon, Jun 30, 2025 at 3:16 AM Lorenzo Stoakes
> > > <lorenzo.stoakes@oracle.com> wrote:
> > > > This feels like you're trying to put mm functionality outside of mm?
> > >
> > > To second that, two things stick out for me here:
> > > 1. uffd_copy and uffd_get_folio seem to be at different abstraction
> > > levels. uffd_copy is almost the entire copy operation for VM_SHARED
> > > VMAs, while uffd_get_folio is a small part of the continue operation.
> > > 2. shmem_mfill_atomic_pte, which becomes uffd_copy for shmem in the
> > > last patch, is quite a complex function which itself calls some IMO
> > > pretty internal functions like mfill_atomic_install_pte(). Expecting
> > > modules to implement such functionality seems like a stretch to me,
> > > but maybe this is for some specialized modules which are written by
> > > mm experts only?
> >
> > To echo what Liam said - I don't think we can truly rely on expertise
> > here (we make enough mistakes in core mm for that to be a dubious
> > proposition even there :) and even if experts were involved, having
> > core mm functionality outside of core mm carries significant risk - we
> > are constantly changing things, including assumptions around sensitive
> > topics such as locking (think VMA locking) - having code elsewhere
> > significantly increases the risk of missing things.
> >
> > I am also absolutely, to be frank, not going to accept us EXPORT()'ing
> > anything core.
> >
> > Page table manipulation really must live in core mm and arch code
> > only; it is easily some of the most subtle, confusing and dangerous
> > code in mm (I have spent substantial hours banging my head against it
> > recently), and again - subject to constant change.
> >
> > But to come back to Liam's comments and to reiterate what I was
> > referring to earlier, even permitting drivers to have access to VMAs
> > is _highly_ problematic and has resulted in very real bugs and subtle
> > issues that took many hours, much stress + gnashing of teeth to
> > address.
>
> The main target of this change is the implementation of UFFD for
> KVM/guest_memfd (examples: [1], [2]) to avoid bringing KVM-specific code
> into the mm codebase. We usually mean KVM by the "drivers" in this
> context, and it is already somewhat "knowledgeable" of the mm. I don't
> think there are existing use cases for other drivers to implement this
> at the moment.
>
> Although I can't see new exports in this series, there is now a way to
> limit exports to particular modules [3]. Would it help if we only do it
> for KVM initially (if/when actually needed)?

There were talks about pulling out guest_memfd core into mm, but I don't
remember patches about it. If parts of guest_memfd were already in mm/,
that would make it easier to export uffd ops to it.

> [1] https://lore.kernel.org/all/114133f5-0282-463d-9d65-3143aa658806@amazon.com/
> [2] https://lore.kernel.org/all/7666ee96-6f09-4dc1-8cb2-002a2d2a29cf@amazon.com/
> [3] https://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git/commit/?h=kbuild&id=707f853d7fa3ce323a6875487890c213e34d81a0
>
> Thanks,
> Nikita
>
> > The very thing of:
> >
> > 	xxx
> > 	<hand off sensitive mutable state X, Y, Z to driver>
> > 	yyy
> >
> > means that between xxx and yyy we can make literally no assumptions
> > about what just happened to all the handed-off state. A single
> > instance of this has caused mayhem; if we did this in such a way as to
> > affect the _many_ uffd hooks, we could have a really serious problem.
> >
> > So - what seems really positive about this series is the
> > _generalisation_ and _abstraction_ of uffd functionality.
> >
> > That is something I appreciate, and I think uffd sorely needs it; in
> > fact, if we could find a way to not need to do:
> >
> > 	if (some_uffd_predicate())
> > 		some_uffd_specific_fn();
> >
> > that'd be incredible.
> >
> > So I think the answer here is to do something like this, and to keep
> > all the mm-specific code in core mm.
> >
> > Thanks, Lorenzo

--
Sincerely yours,
Mike.
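[To make the predicate-removal idea above concrete: a hedged sketch of
dispatching through per-memory-type ops instead of scattered checks.
uffd_ops_of() and handle_continue() are assumed helper names for
illustration, not the series' actual interface.]

/*
 * Sketch only: core mm resolves the ops once from the vma, and the
 * anonymous case (no vm_ops) stays on the core-mm path, matching the
 * series' stated design.
 */
static const struct vm_uffd_ops *uffd_ops_of(struct vm_area_struct *vma)
{
	if (!vma->vm_ops)
		return NULL;		/* anonymous: handled by core mm */
	return vma->vm_ops->userfaultfd_ops;
}

static int handle_continue(struct vm_area_struct *vma, pgoff_t pgoff,
			   struct folio **foliop)
{
	const struct vm_uffd_ops *ops = uffd_ops_of(vma);

	if (!ops || !ops->uffd_get_folio)
		return -EINVAL;
	return ops->uffd_get_folio(file_inode(vma->vm_file), pgoff, foliop);
}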
On Wed, Jul 02, 2025 at 08:39:32PM +0300, Mike Rapoport wrote:

[...]

> > The main target of this change is the implementation of UFFD for
> > KVM/guest_memfd (examples: [1], [2]) to avoid bringing KVM-specific
> > code into the mm codebase. We usually mean KVM by the "drivers" in
> > this context, and it is already somewhat "knowledgeable" of the mm. I
> > don't think there are existing use cases for other drivers to
> > implement this at the moment.
> >
> > Although I can't see new exports in this series, there is now a way to
> > limit exports to particular modules [3]. Would it help if we only do
> > it for KVM initially (if/when actually needed)?
>
> There were talks about pulling out guest_memfd core into mm, but I don't
> remember patches about it. If parts of guest_memfd were already in mm/,
> that would make it easier to export uffd ops to it.

Do we have a link to such a discussion? I'm also curious whether that
idea was acknowledged by KVM maintainers.

Having an abstraction layer for userfaultfd memtypes within mm always
makes some sense on its own to me, so as to remove separate random checks
for either shmem or hugetlb. E.g. after the series is applied, we can
drop the shmem header in the userfaultfd code, which should also be a
step forward.

Thanks,

> > [1] https://lore.kernel.org/all/114133f5-0282-463d-9d65-3143aa658806@amazon.com/
> > [2] https://lore.kernel.org/all/7666ee96-6f09-4dc1-8cb2-002a2d2a29cf@amazon.com/
> > [3] https://git.kernel.org/pub/scm/linux/kernel/git/masahiroy/linux-kbuild.git/commit/?h=kbuild&id=707f853d7fa3ce323a6875487890c213e34d81a0

--
Peter Xu
On Wed, Jul 02, 2025 at 03:46:57PM -0400, Peter Xu wrote:
> On Wed, Jul 02, 2025 at 08:39:32PM +0300, Mike Rapoport wrote:
>
> [...]
>
> > There were talks about pulling out guest_memfd core into mm, but I
> > don't remember patches about it. If parts of guest_memfd were already
> > in mm/, that would make it easier to export uffd ops to it.
>
> Do we have a link to such a discussion? I'm also curious whether that
> idea was acknowledged by KVM maintainers.

AFAIR it was discussed at one of David's guest_memfd calls.

> Having an abstraction layer for userfaultfd memtypes within mm always
> makes some sense on its own to me, so as to remove separate random
> checks for either shmem or hugetlb. E.g. after the series is applied, we
> can drop the shmem header in the userfaultfd code, which should also be
> a step forward.
>
> Thanks,
>
> [...]

--
Sincerely yours,
Mike.
On 03.07.25 19:48, Mike Rapoport wrote:
> On Wed, Jul 02, 2025 at 03:46:57PM -0400, Peter Xu wrote:
>> On Wed, Jul 02, 2025 at 08:39:32PM +0300, Mike Rapoport wrote:
>>
>> [...]
>>
>>> There were talks about pulling out guest_memfd core into mm, but I
>>> don't remember patches about it. If parts of guest_memfd were already
>>> in mm/, that would make it easier to export uffd ops to it.
>>
>> Do we have a link to such a discussion? I'm also curious whether that
>> idea was acknowledged by KVM maintainers.
>
> AFAIR it was discussed at one of David's guest_memfd calls.

While it was discussed in the call a couple of times in different
contexts (guest_memfd as a library / guest_memfd shim), I think we
already discussed it back at LPC last year.

One of the main reasons for doing that is supporting guest_memfd in other
hypervisors -- the gunyah hypervisor in the kernel wants to make use of
it as well.

--
Cheers,

David / dhildenb
On Fri, Jul 04, 2025 at 11:34:15AM +0200, David Hildenbrand wrote:
> On 03.07.25 19:48, Mike Rapoport wrote:
>
> [...]
>
> > AFAIR it was discussed at one of David's guest_memfd calls.
>
> While it was discussed in the call a couple of times in different
> contexts (guest_memfd as a library / guest_memfd shim), I think we
> already discussed it back at LPC last year.
>
> One of the main reasons for doing that is supporting guest_memfd in
> other hypervisors -- the gunyah hypervisor in the kernel wants to make
> use of it as well.

I see, thanks for the info. I found the series, it's here:

https://lore.kernel.org/all/20241113-guestmem-library-v3-0-71fdee85676b@quicinc.com/

Here, the question is whether we still want to keep explicit calls to
shmem, hugetlbfs and, in the future, guest-memfd. Library-izing
guest-memfd doesn't change a whole lot in answering this question, IIUC.

It definitely reduces the use of mfill_atomic_install_pte() so that we
don't need to export it. However, if we want to generalize userfaultfd
capability for a type of memory, we will still need something like the
vm_uffd_ops hook to report such information. It means drivers can still
override these, with or without an exported mfill_atomic_install_pte()
function. I'm not sure whether that eases the concern.

So to me, generalizing the mem type looks helpful with or without moving
guest-memfd under mm/.

We do have the option to keep hard-coding guest-memfd like shmem or
hugetlbfs. This is still "doable", but it likely means guest-memfd
support for userfaultfd needs to be done after that work. I did quickly
check the status of the gunyah hypervisor [1,2,3], and I found that all
of those efforts have not continued in 2025. The hypervisor's last
update was in Jan 2024, with a batch push [1].

I still prefer generalizing uffd capabilities using the ops. That way,
guest-memfd support for MINOR is not blocked, and it should be able to be
done concurrently with the guest-memfd library. If the guest-memfd
library idea doesn't move on, it's a non-issue either way.

I've considered dropping uffd_copy() and MISSING support for vm_uffd_ops
if I'm going to repost - that looks like the only thing that people are
against, even though it is not my preference, as it'll make the API
half-broken on its own. Said that, I still prefer this over hard-coding
and/or using CONFIG_GUESTMEM in the userfaultfd code.

I'll wait for a few more days to see whether there's comment on the above
plan to drop uffd_copy().

Thanks,

[1] https://github.com/quic/gunyah-hypervisor/tree/develop/hyp
[2] https://lore.kernel.org/all/20240516143356.1739402-1-quic_svaddagi@quicinc.com/
[3] https://lore.kernel.org/lkml/20240222-gunyah-v17-0-1e9da6763d38@quicinc.com

--
Peter Xu
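[For context on the plan above: under the series' proposed vm_uffd_ops,
with uffd_copy() and MISSING dropped, a MINOR-only provider might look
roughly like the sketch below. The example_* names and the folio lookup
are assumptions for illustration; an actual guest_memfd implementation
would use its own lookup and locking rules.]

/*
 * Hedged sketch: a memory type advertising only MINOR/CONTINUE via the
 * proposed vm_uffd_ops, with no uffd_copy() at all.
 */
static int example_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
				  struct folio **foliop)
{
	struct folio *folio;

	folio = filemap_lock_folio(inode->i_mapping, pgoff);
	if (IS_ERR(folio))
		return PTR_ERR(folio);
	*foliop = folio;	/* returned locked, with a reference held */
	return 0;
}

static const struct vm_uffd_ops example_uffd_ops = {
	.uffd_features	= VM_UFFD_MINOR,
	.uffd_ioctls	= BIT(_UFFDIO_CONTINUE),
	.uffd_get_folio	= example_uffd_get_folio,
};

/* Hooked up from the provider's vm_operations_struct: */
static const struct vm_operations_struct example_vm_ops = {
	/* ... fault handlers ... */
#ifdef CONFIG_USERFAULTFD
	.userfaultfd_ops = &example_uffd_ops,
#endif
};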
* Peter Xu <peterx@redhat.com> [250704 11:00]:
> On Fri, Jul 04, 2025 at 11:34:15AM +0200, David Hildenbrand wrote:
>
> [...]
>
> I see, thanks for the info. I found the series, it's here:
>
> https://lore.kernel.org/all/20241113-guestmem-library-v3-0-71fdee85676b@quicinc.com/
>
> Here, the question is whether we still want to keep explicit calls to
> shmem, hugetlbfs and, in the future, guest-memfd. Library-izing
> guest-memfd doesn't change a whole lot in answering this question, IIUC.

Can you explore moving hugetlb_mfill_atomic_pte and
shmem_mfill_atomic_pte into mm/userfaultfd.c and generalizing them to use
the same code?

That is, split the similar blocks into functions and reduce duplication.
These are under the UFFD config option and are pretty similar. This will
also limit the bleeding of mfill_atomic_mode out of uffd.

If you look at what the code does in userfaultfd.c, you can see that some
refactoring is necessary for other reasons:

mfill_atomic() either calls mfill_atomic_hugetlb(), or it enters a
while (src_addr < src_start + len) loop to call mfill_atomic_pte().. which
might call shmem_mfill_atomic_pte() in mm/shmem.c.

mfill_atomic_hugetlb(), in a while (src_addr < src_start + len) loop,
calls hugetlb_mfill_atomic_pte() in mm/hugetlb.c.

The shmem call already depends on the vma flags.. which it still does in
your patch 4 here. So you've replaced this:

	if (!(dst_vma->vm_flags & VM_SHARED)) {
		...
	} else {
		shmem_mfill_atomic_pte()
	}

with...

	if (!(dst_vma->vm_flags & VM_SHARED)) {
		...
	} else {
		...
		uffd_ops->uffd_copy()
	}

So, really, what needs to happen first is that userfaultfd needs to be
sorted.

There's no point in creating a vm_ops_uffd if it will just serve to
replace the call locations of the functions like this, as it has done
nothing to simplify the logic.

> However, if we want to generalize userfaultfd capability for a type of
> memory, we will still need something like the vm_uffd_ops hook to report
> such information. It means drivers can still override these, with or
> without an exported mfill_atomic_install_pte() function. I'm not sure
> whether that eases the concern.

If we work through the duplication and reduction where possible, the path
forward may be easier to see.

> So to me, generalizing the mem type looks helpful with or without moving
> guest-memfd under mm/.

Yes, it should decrease the duplication across hugetlb.c and shmem.c, but
I think that userfaultfd is the place to start.

> We do have the option to keep hard-coding guest-memfd like shmem or
> hugetlbfs. [...]
>
> I still prefer generalizing uffd capabilities using the ops. That way,
> guest-memfd support for MINOR is not blocked, and it should be able to
> be done concurrently with the guest-memfd library. If the guest-memfd
> library idea doesn't move on, it's a non-issue either way.
>
> I've considered dropping uffd_copy() and MISSING support for vm_uffd_ops
> if I'm going to repost - that looks like the only thing that people are
> against, even though it is not my preference, as it'll make the API
> half-broken on its own.

The generalisation you did does not generalize much, as I pointed out
above, and so it seems less impactful than it could be.

These patches also do not explore what this means for guest_memfd. So it
is not clear that the expected behaviour will serve the need.

You sent a link to an example user. Can you please keep this work
together in the patch set so that we know it'll work for your use case,
and to allow us an easier way to pull down this work so we can examine
it?

Alternatively, you can reduce and combine the memory types without
exposing the changes externally, if they stand on their own. But I don't
think anyone is going to accept using a vm_ops change where a direct
function call could be used.

> Said that, I still prefer this over hard-coding and/or using
> CONFIG_GUESTMEM in the userfaultfd code.

It also does nothing with regards to CONFIG_USERFAULTFD in other areas.
My hope is that you could pull out the common code and make the CONFIG_
sections smaller.

And, by the way, it isn't the fact that we're going to copy something (or
mfill_atomic it, I guess?) but the fact that we're handing out the
pointer for something else to do that. Please handle this manipulation in
the core mm code without handing out pointers to folios or page tables.

You could do this with the address being passed in and figure out the
type, or even a function pointer that you specifically pass in an enum of
the type (I think this is what Lorenzo was suggesting somewhere), maybe
with multiple flags for actions and fallback (MFILL|ZERO, for example).

Thanks,
Liam
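[A rough sketch of the interface shape Liam describes above: core mm
keeps all folio/page-table work, and callers pass a memory type enum plus
action flags. Every name here is an assumption - no concrete API was
posted in the thread.]

/*
 * Illustrative only: the enum/flags alternative.  Core mm resolves the
 * memory type and performs the fill itself; nothing sensitive is handed
 * out to modules.
 */
enum uffd_memtype {
	UFFD_MEM_ANON,
	UFFD_MEM_SHMEM,
	UFFD_MEM_HUGETLB,
	UFFD_MEM_GUESTMEM,
};

#define UFFD_ACT_MFILL	(1UL << 0)	/* UFFDIO_COPY-style fill */
#define UFFD_ACT_ZERO	(1UL << 1)	/* UFFDIO_ZEROPAGE fallback */

/* Implemented entirely inside mm/userfaultfd.c in this scheme. */
int uffd_mfill_atomic(struct mm_struct *dst_mm, unsigned long dst_addr,
		      unsigned long src_addr, enum uffd_memtype type,
		      unsigned long actions);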
On Fri, Jul 04, 2025 at 03:39:32PM -0400, Liam R. Howlett wrote:
> * Peter Xu <peterx@redhat.com> [250704 11:00]:
>
> [...]
>
> Can you explore moving hugetlb_mfill_atomic_pte and
> shmem_mfill_atomic_pte into mm/userfaultfd.c and generalizing them to
> use the same code?
>
> That is, split the similar blocks into functions and reduce duplication.
> These are under the UFFD config option and are pretty similar. This will
> also limit the bleeding of mfill_atomic_mode out of uffd.

These are different problems to solve.

I did propose a refactoring of hugetlbfs in general years ago. I didn't
finish it. It's not only because it's a huge amount of work that almost
nobody likes to review (even if everybody would like to see it land), but
also that, since we decided not to add more complexity into hugetlbfs
code like HGM, there's less point in refactoring.

I hope we can be on the same page about the problem we want to solve
here. The problem is that we want to support userfaultfd on guest-memfd.
Your suggestion could be helpful, but IMHO it's orthogonal to the current
problem. It's also not a dependency. If it were, you would see code
duplication introduced by this series that would not be needed after a
cleanup. It's not like that.

This series simply resolves a totally different problem. The order of
solving this problem versus cleaning things up elsewhere (or deciding not
to do that at all) is not deterministic, IMHO.

> If you look at what the code does in userfaultfd.c, you can see that
> some refactoring is necessary for other reasons:
>
> mfill_atomic() either calls mfill_atomic_hugetlb(), or it enters a
> while (src_addr < src_start + len) loop to call mfill_atomic_pte()..
> which might call shmem_mfill_atomic_pte() in mm/shmem.c.
>
> mfill_atomic_hugetlb(), in a while (src_addr < src_start + len) loop,
> calls hugetlb_mfill_atomic_pte() in mm/hugetlb.c.

Again, IMHO this still falls into the hugetlbfs refactoring category. I
would sincerely request that we treat that as a separate topic to
discuss, because it's huge and community resources are limited for both
development and review.

Shmem is already almost sharing code with anonymous; after all, these two
memory types are the only ones that support userfaultfd besides
hugetlbfs.

> The shmem call already depends on the vma flags.. which it still does in
> your patch 4 here. So you've replaced this:
>
> 	if (!(dst_vma->vm_flags & VM_SHARED)) {
> 		...
> 	} else {
> 		shmem_mfill_atomic_pte()
> 	}
>
> with...
>
> 	if (!(dst_vma->vm_flags & VM_SHARED)) {
> 		...
> 	} else {
> 		...
> 		uffd_ops->uffd_copy()
> 	}

I'm not 100% sure I get this comment. It's intentional to depend on vma
flags here for shmem. See Andrea's commit:

commit 5b51072e97d587186c2f5390c8c9c1fb7e179505
Author: Andrea Arcangeli <aarcange@redhat.com>
Date:   Fri Nov 30 14:09:28 2018 -0800

    userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem

Are you suggesting we should merge it somehow? I'd appreciate a concrete
example here if so.

> So, really, what needs to happen first is that userfaultfd needs to be
> sorted.
>
> There's no point in creating a vm_ops_uffd if it will just serve to
> replace the call locations of the functions like this, as it has done
> nothing to simplify the logic.

I had a vague feeling that you may not have understood the goal of this
series. I thought we were on the same page there. It could be partly my
fault if that's not obvious; I can try to be more explicit in the cover
letter if I'm going to repost.

Please consider supporting guest-memfd as the goal. vm_uffd_ops is what
allows all the changes to happen in guest_memfd.c. Without it, that can't
happen. That's the whole point so far.

[...]

> You sent a link to an example user. Can you please keep this work
> together in the patch set so that we know it'll work for your use case,
> and to allow us an easier way to pull down this work so we can examine
> it?
>
> Alternatively, you can reduce and combine the memory types without
> exposing the changes externally, if they stand on their own. But I don't
> think anyone is going to accept using a vm_ops change where a direct
> function call could be used.

While I was absent, Nikita sent a version that is based on the library
code. That one uses direct function calls:

https://lore.kernel.org/all/20250915161815.40729-3-kalyazin@amazon.com/

I think it's great that we have the other sample implementation of this.

I believe Nikita has no strong opinion on how this will land in the end,
whatever way the community prefers. I prefer the solution I sent.

Do you have a preference, now that there's explicitly another one to
compare against?

> > Said that, I still prefer this over hard-coding and/or using
> > CONFIG_GUESTMEM in the userfaultfd code.
>
> It also does nothing with regards to CONFIG_USERFAULTFD in other areas.
> My hope is that you could pull out the common code and make the CONFIG_
> sections smaller.
>
> And, by the way, it isn't the fact that we're going to copy something
> (or mfill_atomic it, I guess?) but the fact that we're handing out the
> pointer for something else to do that. Please handle this manipulation
> in the core mm code without handing out pointers to folios or page
> tables.

Is "returning pointers to folios" also an issue now in the API? Are you
NACKing the uffd_get_folio() API that this series proposed?

> You could do this with the address being passed in and figure out the
> type, or even a function pointer that you specifically pass in an enum
> of the type (I think this is what Lorenzo was suggesting somewhere),
> maybe with multiple flags for actions and fallback (MFILL|ZERO, for
> example).

I didn't quickly get it. I'd appreciate some more elaboration.

Thanks,

--
Peter Xu
* Peter Xu <peterx@redhat.com> [250916 15:56]:

...

First, thanks for replying. Sorry I missed it.

> > Can you explore moving hugetlb_mfill_atomic_pte and
> > shmem_mfill_atomic_pte into mm/userfaultfd.c and generalizing them to
> > use the same code?
> >
> > That is, split the similar blocks into functions and reduce
> > duplication. These are under the UFFD config option and are pretty
> > similar. This will also limit the bleeding of mfill_atomic_mode out of
> > uffd.
>
> These are different problems to solve.
>
> [...]
>
> This series simply resolves a totally different problem. The order of
> solving this problem versus cleaning things up elsewhere (or deciding
> not to do that at all) is not deterministic, IMHO.

I find it very difficult to see what abstraction of functions is needed
with the code in its current state. I feel like we are exposing an
interface before we can see what we need.

> > The shmem call already depends on the vma flags.. [...]
>
> I'm not 100% sure I get this comment. It's intentional to depend on vma
> flags here for shmem. See Andrea's commit:
>
> commit 5b51072e97d587186c2f5390c8c9c1fb7e179505
> Author: Andrea Arcangeli <aarcange@redhat.com>
> Date:   Fri Nov 30 14:09:28 2018 -0800
>
>     userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem
>
> Are you suggesting we should merge it somehow? I'd appreciate a concrete
> example here if so.

What I am saying is that the containing function, mfill_atomic_pte(),
should be absorbed into each memory type's ->uffd_copy(), at least
partially.

Shouldn't each memory type do all the necessary checks in ->uffd_copy()?

To put it another way, no shared memory vma will enter the if()
statement, so why are we checking whether they need to?

So if the default uffd_copy() does the if (!shared) stuff, you can just
call uffd_ops->uffd_copy() without any check there, right?

> > So, really, what needs to happen first is that userfaultfd needs to be
> > sorted.
> >
> > There's no point in creating a vm_ops_uffd if it will just serve to
> > replace the call locations of the functions like this, as it has done
> > nothing to simplify the logic.
>
> I had a vague feeling that you may not have understood the goal of this
> series. I thought we were on the same page there. It could be partly my
> fault if that's not obvious; I can try to be more explicit in the cover
> letter if I'm going to repost.

I certainly feel like I've lost the overall goal during the conversations
sometimes; thanks for pointing that out. I keep thinking this is about
memory type modularization - I think the cover letter subject keeps
getting in my way.

> Please consider supporting guest-memfd as the goal. vm_uffd_ops is what
> allows all the changes to happen in guest_memfd.c. Without it, that
> can't happen. That's the whole point so far.

...

> While I was absent, Nikita sent a version that is based on the library
> code. That one uses direct function calls:
>
> https://lore.kernel.org/all/20250915161815.40729-3-kalyazin@amazon.com/
>
> I think it's great that we have the other sample implementation of this.
>
> I believe Nikita has no strong opinion on how this will land in the end,
> whatever way the community prefers. I prefer the solution I sent.
>
> Do you have a preference, now that there's explicitly another one to
> compare against?

I like this better.

> Is "returning pointers to folios" also an issue now in the API? Are you
> NACKing the uffd_get_folio() API that this series proposed?

No, I don't think I see the problem with what I'm proposing. I'd rather
find a middle ground.

I was thinking it best to have the part needing the folio pointer be in
mm, while the rest remains in the guestmem implementation. The other
thread seems to imply that it would all have to be pulled in, and that
seems undesirable as well.

It is just that we have had this sort of interface before and it has gone
poorly for us. I know there are other ways to do what you are doing here
in regards to ->fault(), but that's the most flexible and hardest to
validate way of doing it. If we can avoid exposing it in this way, I'd
rather do that.

> > You could do this with the address being passed in and figure out the
> > type, or even a function pointer that you specifically pass in an enum
> > of the type (I think this is what Lorenzo was suggesting somewhere),
> > maybe with multiple flags for actions and fallback (MFILL|ZERO, for
> > example).
>
> I didn't quickly get it. I'd appreciate some more elaboration.

What I was thinking of was the same sort of thing that happens with the
ioctl today, but with a memory type, and only for the callback that needs
the folio.

But again, I'm sure I'm missing something, and I'm sorry to drag this
out.. And sorry if I upset you; I feel like we're talking past each other
and causing a lot of stress.

Thanks,
Liam
On Fri, Sep 19, 2025 at 01:22:01PM -0400, Liam R. Howlett wrote:
> > > The shmem call already depends on the vma flags.. which it still
> > > does in your patch 4 here. So you've replaced this:
> > >
> > > 	if (!(dst_vma->vm_flags & VM_SHARED)) {
> > > 		...
> > > 	} else {
> > > 		shmem_mfill_atomic_pte()
> > > 	}
> > >
> > > with...
> > >
> > > 	if (!(dst_vma->vm_flags & VM_SHARED)) {
> > > 		...
> > > 	} else {
> > > 		...
> > > 		uffd_ops->uffd_copy()
> > > 	}
> >
> > I'm not 100% sure I get this comment. It's intentional to depend on
> > vma flags here for shmem. See Andrea's commit:
> >
> > commit 5b51072e97d587186c2f5390c8c9c1fb7e179505
> > Author: Andrea Arcangeli <aarcange@redhat.com>
> > Date:   Fri Nov 30 14:09:28 2018 -0800
> >
> >     userfaultfd: shmem: allocate anonymous memory for MAP_PRIVATE shmem
> >
> > Are you suggesting we should merge it somehow? I'd appreciate a
> > concrete example here if so.
>
> What I am saying is that the containing function, mfill_atomic_pte(),
> should be absorbed into each memory type's ->uffd_copy(), at least
> partially.
>
> Shouldn't each memory type do all the necessary checks in ->uffd_copy()?
>
> To put it another way, no shared memory vma will enter the if()
> statement, so why are we checking whether they need to?
>
> So if the default uffd_copy() does the if (!shared) stuff, you can just
> call uffd_ops->uffd_copy() without any check there, right?

IIUC this is the only piece of the technical side of the discussion that
was not addressed, so I'm replying explicitly to this one. Please let me
know if I missed something else.

Here we need to keep MAP_PRIVATE separate, as common code for
UFFDIO_COPY / UFFDIO_ZEROPAGE. It doesn't only apply to shmem: it was
about shmem only at the time Andrea fixed this case because hugetlbfs was
working fine via its separate path.

In general, shmem is the file system, and when it's mapped MAP_PRIVATE
(even if it's still called shmem) we should bypass the page cache, hence
uffd_copy() shouldn't be used. The same should apply to other file
systems when UFFDIO_COPY/ZEROPAGE is implemented via ->uffd_copy().

However, since we're not going to export ->uffd_copy() in the new
version, it's also out of the picture.

Thanks,

--
Peter Xu
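[To make the dispatch shape Peter describes concrete: MAP_PRIVATE stays
on the common anonymous path in core mm regardless of memory type, and
only the shared case would go through a per-type handler. This is a
hedged sketch reusing the uffd_ops_of() name from the earlier sketch;
mfill_atomic_pte_anon() is an assumed helper, not the actual function.]

static int mfill_atomic_pte_sketch(pmd_t *dst_pmd,
				   struct vm_area_struct *dst_vma,
				   unsigned long dst_addr,
				   unsigned long src_addr,
				   uffd_flags_t flags,
				   struct folio **foliop)
{
	/*
	 * MAP_PRIVATE bypasses the page cache: CoW semantics mean the
	 * destination is effectively anonymous memory, so core mm
	 * handles it directly for every memory type.
	 */
	if (!(dst_vma->vm_flags & VM_SHARED))
		return mfill_atomic_pte_anon(dst_pmd, dst_vma, dst_addr,
					     src_addr, flags, foliop);

	/* Only shared mappings would dispatch per memory type. */
	return uffd_ops_of(dst_vma)->uffd_copy(dst_pmd, dst_vma, dst_addr,
					       src_addr, flags, foliop);
}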
On 04/07/2025 20:39, Liam R. Howlett wrote: > * Peter Xu <peterx@redhat.com> [250704 11:00]: >> On Fri, Jul 04, 2025 at 11:34:15AM +0200, David Hildenbrand wrote: >>> On 03.07.25 19:48, Mike Rapoport wrote: >>>> On Wed, Jul 02, 2025 at 03:46:57PM -0400, Peter Xu wrote: >>>>> On Wed, Jul 02, 2025 at 08:39:32PM +0300, Mike Rapoport wrote: >>>>> >>>>> [...] >>>>> >>>>>>> The main target of this change is the implementation of UFFD for >>>>>>> KVM/guest_memfd (examples: [1], [2]) to avoid bringing KVM-specific code >>>>>>> into the mm codebase. We usually mean KVM by the "drivers" in this context, >>>>>>> and it is already somewhat "knowledgeable" of the mm. I don't think there >>>>>>> are existing use cases for other drivers to implement this at the moment. >>>>>>> >>>>>>> Although I can't see new exports in this series, there is now a way to limit >>>>>>> exports to particular modules [3]. Would it help if we only do it for KVM >>>>>>> initially (if/when actually needed)? >>>>>> >>>>>> There were talks about pulling out guest_memfd core into mm, but I don't >>>>>> remember patches about it. If parts of guest_memfd were already in mm/ that >>>>>> would make easier to export uffd ops to it. >>>>> >>>>> Do we have a link to such discussion? I'm also curious whether that idea >>>>> was acknowledged by KVM maintainers. >>>> >>>> AFAIR it was discussed at one of David's guest_memfd calls >>> >>> While it was discussed in the call a couple of times in different context >>> (guest_memfd as a library / guest_memfd shim), I think we already discussed >>> it back at LPC last year. >>> >>> One of the main reasons for doing that is supporting guest_memfd in other >>> hypervisors -- the gunyah hypervisor in the kernel wants to make use of it >>> as well. >> >> I see, thanks for the info. I found the series, it's here: >> >> https://lore.kernel.org/all/20241113-guestmem-library-v3-0-71fdee85676b@quicinc.com/ >> >> Here, the question is whether do we still want to keep explicit calls to >> shmem, hugetlbfs and in the future, guest-memfd. The library-ize of >> guest-memfd doesn't change a huge lot on answering this question, IIUC. > > Can you explore moving hugetlb_mfill_atomic_pte and > shmem_mfill_atomic_pte into mm/userfaultfd.c and generalizing them to > use the same code? > > That is, split the similar blocks into functions and reduce duplication. > > These are under the UFFD config option and are pretty similar. This > will also limit the bleeding of mfill_atomic_mode out of uffd. > > > > If you look at what the code does in userfaultfd.c, you can see that > some refactoring is necessary for other reasons: > > mfill_atomic() calls mfill_atomic_hugetlb(), or it enters a while > (src_addr < src_start + len) to call mfill_atomic_pte().. which might > call shmem_mfill_atomic_pte() in mm/shmem.c > > mfill_atomic_hugetlb() calls, in a while (src_addr < src_start + len) > loop and calls hugetlb_mfill_atomic_pte() in mm/hugetlb.c > > The shmem call already depends on the vma flags.. which it still does in > your patch 4 here. So you've replaced this: > > if (!(dst_vma->vm_flags & VM_SHARED)) { > ... > } else { > shmem_mfill_atomic_pte() > } > > With... > > if (!(dst_vma->vm_flags & VM_SHARED)) { > ... > } else { > ... > uffd_ops->uffd_copy() > } > > So, really, what needs to happen first is userfaultfd needs to be > sorted. > > There's no point of creating a vm_ops_uffd if it will just serve as > replacing the call locations of the functions like this, as it has done > nothing to simplify the logic. 
> >> However if we want to generalize userfaultfd capability for a type of >> memory, we will still need something like the vm_uffd_ops hook to report >> such information. It means drivers can still overwrite these, with/without >> an exported mfill_atomic_install_pte() functions. I'm not sure whether >> that eases the concern. > > If we work through the duplication and reduction where possible, the > path forward may be easier to see. > >> >> So to me, generalizing the mem type looks helpful with/without moving >> guest-memfd under mm/. > > Yes, it should decrease the duplication across hugetlb.c and shmem.c, > but I think that userfaultfd is the place to start. > >> >> We do have the option to keep hard-code guest-memfd like shmem or >> hugetlbfs. This is still "doable", but this likely means guest-memfd >> support for userfaultfd needs to be done after that work. I did quickly >> check the status of gunyah hypervisor [1,2,3], I found that all of the >> efforts are not yet continued in 2025. The hypervisor last update was Jan >> 2024 with a batch push [1]. >> >> I still prefer generalizing uffd capabilities using the ops. That makes >> guest-memfd support on MINOR not be blocked and it should be able to be >> done concurrently v.s. guest-memfd library. If guest-memfd library idea >> didn't move on, it's non-issue either. >> >> I've considered dropping uffd_copy() and MISSING support for vm_uffd_ops if >> I'm going to repost - that looks like the only thing that people are >> against with, even though that is not my preference, as that'll make the >> API half-broken on its own. > > The generalisation you did does not generalize much, as I pointed out > above, and so it seems less impactful than it could be. > > These patches also do not explore what this means for guest_memfd. So > it is not clear that the expected behaviour will serve the need. > > You sent a link to an example user. Can you please keep this work > together in the patch set so that we know it'll work for your use case > and allows us an easier way to pull down this work so we can examine it. Hi Liam, Lorenzo, With mmap support in guest_memfd being recently accepted and merged into kvm/next [1], UFFDIO_CONTINUE support in guest_memfd becomes a real use case. From what I understand, it is the API for UFFDIO_COPY (ie the .uffd_copy callback) proposed by Peter that raises the safery issues, while the generalisation of the checks (.uffd_features, .uffd_ioctls) and .uffd_get_folio needed for UFFDIO_CONTINUE do not introduce such concerns. In order to unblock the userfaultfd support in guest_memfd, would it be acceptable to start with implementing .uffd_get_folio/UFFDIO_CONTINUE only, leaving the callback for UFFDIO_COPY for later when we have an idea about a safer API and a clear use case for that? If that's the case, what would be the best way to demonstrate the client code? Would using kvm/next as the base for the aggregated series (new mm API + client code in kvm/guest_memfd) work? Thanks, Nikita [1]: https://lore.kernel.org/kvm/20250729225455.670324-1-seanjc@google.com/ > > Alternatively, you can reduce and combine the memory types without > exposing the changes externally, if they stand on their own. But I > don't think anyone is going to accept using a vm_ops change where a > direct function call could be used. > >> Said that, I still prefer this against >> hard-code and/or use CONFIG_GUESTMEM in userfaultfd code. > > It also does nothing with regards to the CONFIG_USERFAULTFD in other > areas. 
My hope is that you could pull out the common code and make the > CONFIG_ sections smaller. > > And, by the way, it isn't the fact that we're going to copy something > (or mfill_atomic it, I guess?) but the fact that we're handing out the > pointer for something else to do that. Please handle this manipulation > in the core mm code without handing out pointers to folios, or page > tables. > > You could do this with the address being passed in and figure out the > type, or even a function pointer that you specifically pass in an enum > of the type (I think this is what Lorenzo was suggesting somewhere), > maybe with multiple flags for actions and fallback (MFILL|ZERO for > example). > > Thanks, > Liam >
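For reference, the generalization Liam sketches above might look roughly like this: one walker in mm/userfaultfd.c, with the memory-type-specific work behind a single call. This is only an illustration under that assumption; uffd_mfill_one() and mfill_atomic_walk() are hypothetical names, not from any posted series.

#include <linux/mm.h>
#include <linux/userfaultfd_k.h>

/* Hypothetical per-type handler: anon, shmem, hugetlb, ... */
static int uffd_mfill_one(struct mm_struct *dst_mm, unsigned long dst_addr,
			  unsigned long src_addr, uffd_flags_t flags);

/*
 * One loop for every memory type, instead of mfill_atomic() and
 * mfill_atomic_hugetlb() each carrying their own copy of it.
 */
static ssize_t mfill_atomic_walk(struct mm_struct *dst_mm,
				 unsigned long dst_start,
				 unsigned long src_start,
				 unsigned long len, uffd_flags_t flags)
{
	unsigned long dst_addr = dst_start, src_addr = src_start;
	ssize_t copied = 0;
	int err;

	while (src_addr < src_start + len) {
		err = uffd_mfill_one(dst_mm, dst_addr, src_addr, flags);
		if (err)
			return copied ? copied : err;
		copied += PAGE_SIZE;
		dst_addr += PAGE_SIZE;
		src_addr += PAGE_SIZE;
	}
	return copied;
}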
* Nikita Kalyazin <kalyazin@amazon.com> [250901 12:01]: ... > Hi Liam, Lorenzo, > > With mmap support in guest_memfd being recently accepted and merged into > kvm/next [1], UFFDIO_CONTINUE support in guest_memfd becomes a real use > case. > > From what I understand, it is the API for UFFDIO_COPY (i.e. the .uffd_copy > callback) proposed by Peter that raises the safety issues, while the > generalisation of the checks (.uffd_features, .uffd_ioctls) and > .uffd_get_folio needed for UFFDIO_CONTINUE do not introduce such concerns. > In order to unblock the userfaultfd support in guest_memfd, would it be > acceptable to start with implementing .uffd_get_folio/UFFDIO_CONTINUE only, > leaving the callback for UFFDIO_COPY for later when we have an idea about a > safer API and a clear use case for that? > > If that's the case, what would be the best way to demonstrate the client > code? Would using kvm/next as the base for the aggregated series (new mm > API + client code in kvm/guest_memfd) work? Reading through the patches, I'm not entirely sure what you are proposing. 
What I was hoping to see by a generalization of the memory types is a much simpler shared code base until the code hits memory-type-specific areas where a function pointer could be used to keep things from getting complicated (or, I guess, a switch statement..). What we don't want is non-mm code specifying values for the function pointer and doing what they want, or a function pointer that returns a core mm resource (in the old example this was a vma, here it is a folio). From this patch set: + * Return: zero if succeeded, negative for errors. + */ + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, + struct folio **folio); This is one of the contention points in the current scenario as the folio would be returned. If you are going to be manipulating an mm resource it should be in mm code, especially if it requires mm locks. Thanks, Liam
Hi Liam, On Mon, Sep 08, 2025 at 12:53:37PM -0400, Liam R. Howlett wrote: > > Reading through the patches, I'm not entirely sure what you are > proposing. > > What I was hoping to see by a generalization of the memory types is a > much simpler shared code base until the code hit memory type specific > areas where a function pointer could be used to keep things from getting > complicated (or, I guess a switch statement..). > > What we don't want is non-mm code specifying values for the function > pointer and doing what they want, or a function pointer that returns a > core mm resource (in the old example this was a vma, here it is a > folio). > > From this patch set: > + * Return: zero if succeeded, negative for errors. > + */ > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > + struct folio **folio); > > This is one of the contention points in the current scenario as the > folio would be returned. I don't see a problem with it. It's not any different from vma_ops->fault(): a callback for a filesystem to get a folio that will be mapped afterwards by the mm code. -- Sincerely yours, Mike.
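For readers following along, the ->fault() pattern Mike is comparing against reduces to roughly the following: the driver only locates its folio, and core mm installs the PTE. This is a minimal sketch with locking and error handling trimmed; example_fault() is a made-up name.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static vm_fault_t example_fault(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct folio *folio;

	/* Look up the folio backing this file offset in the page cache. */
	folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
	if (IS_ERR(folio))
		return VM_FAULT_SIGBUS;

	/* Core mm consumes vmf->page and maps it; the driver is done. */
	vmf->page = folio_file_page(folio, vmf->pgoff);
	return VM_FAULT_LOCKED;
}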
* Mike Rapoport <rppt@kernel.org> [250917 05:26]: > Hi Liam, > > On Mon, Sep 08, 2025 at 12:53:37PM -0400, Liam R. Howlett wrote: > > > > Reading through the patches, I'm not entirely sure what you are > > proposing. > > > > What I was hoping to see by a generalization of the memory types is a > > much simpler shared code base until the code hits memory-type-specific > > areas where a function pointer could be used to keep things from getting > > complicated (or, I guess, a switch statement..). > > > > What we don't want is non-mm code specifying values for the function > > pointer and doing what they want, or a function pointer that returns a > > core mm resource (in the old example this was a vma, here it is a > > folio). > > > > From this patch set: > > + * Return: zero if succeeded, negative for errors. > > + */ > > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > > + struct folio **folio); > > > > This is one of the contention points in the current scenario as the > > folio would be returned. > > I don't see a problem with it. It's not any different from > vma_ops->fault(): a callback for a filesystem to get a folio that will be > mapped afterwards by the mm code. > I disagree, the filesystem vma_ops->fault() is not a config option like this one. So we are on a path to enable uffd by default, and it really needs work beyond this series. Setting up a list head and passing it in through every call stack is far from ideal. I also think the filesystem model is not one we want to duplicate in mm for memory types - think of the test issues we have now and then have a look at the xfstests support of filesystems [1]. So we are on a path of less test coverage, and more code that is actually about mm that is outside of mm. So, is there another way? Cheers, Liam [1]. https://github.com/kdave/xfstests/blob/master/README#L37
On Wed, Sep 17, 2025 at 12:53:05PM -0400, Liam R. Howlett wrote: > * Mike Rapoport <rppt@kernel.org> [250917 05:26]: > > Hi Liam, > > > > On Mon, Sep 08, 2025 at 12:53:37PM -0400, Liam R. Howlett wrote: > > > > > > Reading through the patches, I'm not entirely sure what you are > > > proposing. > > > > > > What I was hoping to see by a generalization of the memory types is a > > > much simpler shared code base until the code hits memory-type-specific > > > areas where a function pointer could be used to keep things from getting > > > complicated (or, I guess, a switch statement..). > > > > > > What we don't want is non-mm code specifying values for the function > > > pointer and doing what they want, or a function pointer that returns a > > > core mm resource (in the old example this was a vma, here it is a > > > folio). > > > > > > From this patch set: > > > + * Return: zero if succeeded, negative for errors. > > > + */ > > > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > > > + struct folio **folio); > > > > > > This is one of the contention points in the current scenario as the > > > folio would be returned. > > > > I don't see a problem with it. It's not any different from > > vma_ops->fault(): a callback for a filesystem to get a folio that will be > > mapped afterwards by the mm code. > > > > I disagree, the filesystem vma_ops->fault() is not a config option like > this one. So we are on a path to enable uffd by default, and it really > needs work beyond this series. Setting up a list head and passing it in > through every call stack is far from ideal. I don't follow you here. How does the addition of uffd callbacks guarded by a config option to vma_ops lead to enabling uffd by default? > I also think the filesystem model is not one we want to duplicate in mm > for memory types - think of the test issues we have now and then have a > look at the xfstests support of filesystems [1]. > > So we are on a path of less test coverage, and more code that is > actually about mm that is outside of mm. So, is there another way? There are quite a few vma_ops outside fs/ not covered by xfstest, so the test coverage argument is moot at best. And anything in the kernel can grab a folio and do whatever it pleases. Nevertheless, let's step back for a second and instead focus on the problem these patches are trying to solve, which is to allow guest_memfd to implement UFFD_CONTINUE (or minor fault in other terminology). This means uffd should be able to map a folio that's already in the guest_memfd page cache to the faulted address. Obviously, the page table update happens in uffd. But it still has to find what to map and we need some way to let guest_memfd tell that to uffd. So we need a hook somewhere that will return a folio matching pgoff in vma->file->inode. Do you see a way to implement it otherwise? -- Sincerely yours, Mike.
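As a concrete illustration of the hook Mike describes, a guest_memfd-side implementation could plausibly be as small as the following. The function name and the plain page-cache lookup are assumptions for illustration, not code from a posted series; the key property is that it only looks up, and never allocates.

static int kvm_gmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
				   struct folio **foliop)
{
	/* UFFDIO_CONTINUE maps what is already there; never allocate. */
	struct folio *folio = filemap_lock_folio(inode->i_mapping, pgoff);

	if (IS_ERR(folio))
		return -EFAULT;

	*foliop = folio;
	return 0;
}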
* Mike Rapoport <rppt@kernel.org> [250918 04:37]: > On Wed, Sep 17, 2025 at 12:53:05PM -0400, Liam R. Howlett wrote: > > * Mike Rapoport <rppt@kernel.org> [250917 05:26]: > > > Hi Liam, > > > > > > On Mon, Sep 08, 2025 at 12:53:37PM -0400, Liam R. Howlett wrote: > > > > > > > > Reading through the patches, I'm not entirely sure what you are > > > > proposing. > > > > > > > > What I was hoping to see by a generalization of the memory types is a > > > > much simpler shared code base until the code hits memory-type-specific > > > > areas where a function pointer could be used to keep things from getting > > > > complicated (or, I guess, a switch statement..). > > > > > > > > What we don't want is non-mm code specifying values for the function > > > > pointer and doing what they want, or a function pointer that returns a > > > > core mm resource (in the old example this was a vma, here it is a > > > > folio). > > > > > > > > From this patch set: > > > > + * Return: zero if succeeded, negative for errors. > > > > + */ > > > > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff, > > > > + struct folio **folio); > > > > > > > > This is one of the contention points in the current scenario as the > > > > folio would be returned. > > > > > > I don't see a problem with it. It's not any different from > > > vma_ops->fault(): a callback for a filesystem to get a folio that will be > > > mapped afterwards by the mm code. > > > > > > > I disagree, the filesystem vma_ops->fault() is not a config option like > > this one. So we are on a path to enable uffd by default, and it really > > needs work beyond this series. Setting up a list head and passing it in > > through every call stack is far from ideal. > > I don't follow you here. How does the addition of uffd callbacks guarded by a config > option to vma_ops lead to enabling uffd by default? Any new memory type that uses the above interface now needs uffd enabled, anyone planning to use guest_memfd needs it enabled, anyone able to get a module using this interface needs it enabled (by whoever gives them the kernel they use). Kernel providers now need to enable UFFD - which is different from the example provided. > > > I also think the filesystem model is not one we want to duplicate in mm > > for memory types - think of the test issues we have now and then have a > > look at the xfstests support of filesystems [1]. > > > > So we are on a path of less test coverage, and more code that is > > actually about mm that is outside of mm. So, is there another way? > > There are quite a few vma_ops outside fs/ not covered by xfstest, so the > test coverage argument is moot at best. > And anything in the kernel can grab a folio and do whatever it pleases. Testing filesystems is nothing short of a nightmare and I don't want mm to march happily towards that end. This interface is endlessly flexible and thus endlessly broken and working at the same time. IOW, we have given anyone wanting to implement a new memory type infinite freedoms to run afoul, but they won't be looking for those people when things go horribly wrong - they will most likely see a memory issue and come here. syzbot will see a hang on some mm lock in an unrelated task, or whatever. I would rather avoid the endlessly flexible interface to avoid incorrect uses in favour of a limited selection of choices, that could be expanded if necessary, but would be more visible to the mm people going in. That is, people can add new memory types through adding them to mm/ instead of in driver/ or out of tree. 
I could very much see someone looking to use this for a binder-type driver and that might work out really well! But I don't want someone doing it and shoving the folio pointer in a custom struct because they *know* it's fine, so what's the big deal? I don't mean to pick on binder, but this example comes to mind. > > Nevertheless, let's step back for a second and instead focus on the problem > these patches are trying to solve, which is to allow guest_memfd implement > UFFD_CONTINUE (or minor fault in other terminology). Well, this is about modularizing memory types, but the first user is supposed to be the guest-memfd support. > > This means uffd should be able to map a folio that's already in > guest_memfd page cache to the faulted address. Obviously, the page table > update happens in uffd. But it still has to find what to map and we need > some way to let guest_memfd tell that to uffd. > > So we need a hook somewhere that will return a folio matching pgoff in > vma->file->inode. > > Do you see a way to implement it otherwise? I must be missing something. UFFDIO_CONTINUE currently enters through an ioctl that calls userfaultfd_continue() -> mfill_atomic_continue()... mfill_atomic() gets and uses the folio to actually do the work. Right now, we don't hand out the folio, so what is different here? I am under the impression that we don't need to return the folio, but may need to do work on it. That is, we can give the mm side what it needs to call the related memory type functions to service the request. For example, one could pass in the inode, pgoff, and memory type and the mm code could then call the fault handler for that memory type? I didn't think Nikita had a folio returned in his first three patches [1], but then they built on other patches and it was difficult to follow along. Is it because that interface was agreed on in a call on 23 Jan 2025 [2], as somewhat unclearly stated in [1]? Thanks, Liam [1]. https://lore.kernel.org/all/20250404154352.23078-1-kalyazin@amazon.com/ [2]. https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAosPOk/edit?tab=t.0#heading=h.w1126rgli5e3
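Concretely, the alternative Liam floats here - mm-internal dispatch on a memory type, so the folio pointer never leaves mm - could take a shape like the one below. All names are hypothetical; this only illustrates the direction, not a real patch.

enum uffd_mem_type {
	UFFD_MEM_SHMEM,
	UFFD_MEM_HUGETLB,
	UFFD_MEM_GUEST_MEMFD,
};

/* Hypothetical per-type handlers, all living under mm/. */
int uffd_continue_shmem(struct inode *inode, pgoff_t pgoff);
int uffd_continue_hugetlb(struct inode *inode, pgoff_t pgoff);
int uffd_continue_gmem(struct inode *inode, pgoff_t pgoff);

/* Core mm services the CONTINUE itself; callers never see the folio. */
static int uffd_continue_one(struct inode *inode, pgoff_t pgoff,
			     enum uffd_mem_type type)
{
	switch (type) {
	case UFFD_MEM_SHMEM:
		return uffd_continue_shmem(inode, pgoff);
	case UFFD_MEM_HUGETLB:
		return uffd_continue_hugetlb(inode, pgoff);
	case UFFD_MEM_GUEST_MEMFD:
		return uffd_continue_gmem(inode, pgoff);
	}
	return -EINVAL;
}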
On Thu, Sep 18, 2025 at 12:47:41PM -0400, Liam R. Howlett wrote: > * Mike Rapoport <rppt@kernel.org> [250918 04:37]: > > On Wed, Sep 17, 2025 at 12:53:05PM -0400, Liam R. Howlett wrote: > > > > > > I disagree, the filesystem vma_ops->fault() is not a config option like > > > this one. So we are on a path to enable uffd by default, and it really > > > needs work beyond this series. Setting up a list head and passing it in > > > through every call stack is far from ideal. > > > > I don't follow you here. How does the addition of uffd callbacks guarded by a config > > option to vma_ops lead to enabling uffd by default? > > Any new memory type that uses the above interface now needs uffd > enabled, anyone planning to use guest_memfd needs it enabled, anyone > able to get a module using this interface needs it enabled (by whoever > gives them the kernel they use). Kernel providers now need to enable > UFFD - which is different from the example provided. My understanding of Peter's suggestion is that *if* uffd is enabled, a memory type *may* implement the API, whatever API we'll come up with. > > Nevertheless, let's step back for a second and instead focus on the problem > > these patches are trying to solve, which is to allow guest_memfd to implement > > UFFD_CONTINUE (or minor fault in other terminology). > > Well, this is about modularizing memory types, but the first user is > supposed to be the guest-memfd support. > > > > > This means uffd should be able to map a folio that's already in > > the guest_memfd page cache to the faulted address. Obviously, the page table > > update happens in uffd. But it still has to find what to map and we need > > some way to let guest_memfd tell that to uffd. > > > > So we need a hook somewhere that will return a folio matching pgoff in > > vma->file->inode. > > > > Do you see a way to implement it otherwise? > > I must be missing something. > > UFFDIO_CONTINUE currently enters through an ioctl that calls > userfaultfd_continue() -> mfill_atomic_continue()... mfill_atomic() gets > and uses the folio to actually do the work. Right now, we don't hand > out the folio, so what is different here? The ioctl() is the means for userspace to resolve a page fault, and mfill_atomic() needs something similar to ->fault() to actually get the folio. And in the case of shmem and guest_memfd the folio lives in the page cache. > I am under the impression that we don't need to return the folio, but > may need to do work on it. That is, we can give the mm side what it > needs to call the related memory type functions to service the request. > > For example, one could pass in the inode, pgoff, and memory type and the > mm code could then call the fault handler for that memory type? How does calling the fault handler differ conceptually from calling uffd_get_folio()? If you take a look at UFFD_CONTINUE for shmem, this is pretty much what's happening. The uffd side finds the inode and pgoff and calls shmem_get_folio(), which is very much similar to shmem->fault(). -- Sincerely yours, Mike.
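The existing shmem path Mike refers to, paraphrased from mm/userfaultfd.c - condensed, with error paths trimmed, and exact signatures vary across kernel versions, so treat this as a sketch of the flow rather than a verbatim copy:

static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
				     struct vm_area_struct *dst_vma,
				     unsigned long dst_addr,
				     uffd_flags_t flags)
{
	struct inode *inode = file_inode(dst_vma->vm_file);
	pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
	struct folio *folio;
	int ret;

	/* SGP_NOALLOC: find the folio in the page cache, never allocate. */
	ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
	if (ret == -ENOENT)
		ret = -EFAULT;	/* nothing for userspace to CONTINUE on */
	if (ret)
		return ret;

	/* Installing the PTE is generic mm code, shared with UFFDIO_COPY. */
	ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
				       folio_file_page(folio, pgoff),
				       false, flags);
	folio_unlock(folio);
	if (ret)
		folio_put(folio);
	return ret;
}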
* Mike Rapoport <rppt@kernel.org> [250918 14:05]: ... > > > I am under the impression that we don't need to return the folio, but > > may need to do work on it. That is, we can give the mm side what it > > needs to call the related memory type functions to service the request. > > > > For example, one could pass in the inode, pgoff, and memory type and the > > mm code could then call the fault handler for that memory type? > > How does calling the fault handler differ conceptually from calling > uffd_get_folio()? > If you take a look at UFFD_CONTINUE for shmem, this is pretty much what's > happening. The uffd side finds the inode and pgoff and calls shmem_get_folio(), > which is very much similar to shmem->fault(). I believe the difference is the location of the code that handles the folio. One would decouple the folio processing from the mm while the other would decouple which processing of the folio is done within the mm. Does that make sense?
On Thu, Sep 18, 2025 at 02:32:31PM -0400, Liam R. Howlett wrote: > * Mike Rapoport <rppt@kernel.org> [250918 14:05]: > > ... > > > > > > I am under the impression that we don't need to return the folio, but > > > may need to do work on it. That is, we can give the mm side what it > > > needs to call the related memory type functions to service the request. > > > > > > For example, one could pass in the inode, pgoff, and memory type and the > > > mm code could then call the fault handler for that memory type? > > > > How does calling the fault handler differ conceptually from calling > > uffd_get_folio()? > > If you take a look at UFFD_CONTINUE for shmem, this is pretty much what's > > happening. The uffd side finds the inode and pgoff and calls shmem_get_folio(), > > which is very much similar to shmem->fault(). > > I believe the difference is the location of the code that handles the folio. One would > decouple the folio processing from the mm while the other would decouple > which processing of the folio is done within the mm. > > Does that make sense? No :) In short, uffd_get_folio() is a special case of ->fault(). tl;dr version is that the whole processing is page fault handling with a brief excursion to userspace. For VMAs registered with uffd, page faults are intercepted by uffd and delivered as events to userspace. There are several ways for userspace to resolve a page fault; in this case we are talking about UFFD_CONTINUE. Its semantics are similar to a minor fault - if the faulted folio is already in the page cache of the address space backing the VMA, it is mapped into the page table. If the folio is not in the page cache uffd returns -EFAULT. So the processing of the folio that uffd_get_folio() does is exactly what ->fault() would do for a folio found in the page cache of the inode backing the VMA. And unlike ->fault(), uffd_get_folio() should not allocate a new folio if it's not in the page cache. -- Sincerely yours, Mike.
On Thu, Sep 18, 2025 at 02:32:31PM -0400, Liam R. Howlett wrote: > I believe the difference is the location of the code that handles the folio. One would > decouple the folio processing from the mm while the other would decouple > which processing of the folio is done within the mm. > > Does that make sense? Not to me. do_fault() allows allocation and some other things (e.g. being trappable), uffd_get_folio() doesn't. They're fundamentally exactly the same otherwise. -- Peter Xu
On 18/09/2025 17:47, Liam R. Howlett wrote: > * Mike Rapoport <rppt@kernel.org> [250918 04:37]: ... > I would rather avoid the endlessly flexible interface to avoid incorrect > uses in favour of a limited selection of choices, that could be expanded > if necessary, but would be more visible to the mm people going in.
That > is, people can add new memory types through adding them to mm/ instead > of in driver/ or out of tree. > > I could very much see someone looking to use this for a binder-type > driver and that might work out really well! But I don't want someone > doing it and shoving the folio pointer in a custom struct because they > *know* it's fine, so what's the big deal? I don't mean to pick on > binder, but this example comes to mind. > >> >> Nevertheless, let's step back for a second and instead focus on the problem >> these patches are trying to solve, which is to allow guest_memfd to implement >> UFFD_CONTINUE (or minor fault in other terminology). > > Well, this is about modularizing memory types, but the first user is > supposed to be the guest-memfd support. > >> >> This means uffd should be able to map a folio that's already in >> the guest_memfd page cache to the faulted address. Obviously, the page table >> update happens in uffd. But it still has to find what to map and we need >> some way to let guest_memfd tell that to uffd. >> >> So we need a hook somewhere that will return a folio matching pgoff in >> vma->file->inode. >> >> Do you see a way to implement it otherwise? > > I must be missing something. > > UFFDIO_CONTINUE currently enters through an ioctl that calls > userfaultfd_continue() -> mfill_atomic_continue()... mfill_atomic() gets > and uses the folio to actually do the work. Right now, we don't hand > out the folio, so what is different here? > > I am under the impression that we don't need to return the folio, but > may need to do work on it. That is, we can give the mm side what it > needs to call the related memory type functions to service the request. > > For example, one could pass in the inode, pgoff, and memory type and the > mm code could then call the fault handler for that memory type? > > I didn't think Nikita had a folio returned in his first three patches > [1], but then they built on other patches and it was difficult to follow > along. Is it because that interface was agreed on in a call on 23 Jan > 2025 [2], as somewhat unclearly stated in [1]? I believe you can safely ignore what was discussed in [2] as it is irrelevant to this discussion. That was just reasoning why it was possible to use UserfaultFD for guest_memfd as opposed to inventing an alternative solution to handling faults in userspace. Regarding returning a folio, [1] was calling vm_ops->fault() in UserfaultFD code. The fault() itself gets a folio (at least in the guest_memfd implementation [3]). Does it look like a preferable solution to you? The other patches I was building on top of were mmap support in guest_memfd [4], which is currently merged in kvm/next, and also part of [3]. [3] https://git.kernel.org/pub/scm/linux/kernel/git/david/linux.git/tree/virt/kvm/guest_memfd.c?id=911634bac3107b237dcd8fdcb6ac91a22741cbe7#n347 [4] https://lore.kernel.org/kvm/20250729225455.670324-1-seanjc@google.com > > Thanks, > Liam > > [1]. https://lore.kernel.org/all/20250404154352.23078-1-kalyazin@amazon.com/ > [2]. https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAosPOk/edit?tab=t.0#heading=h.w1126rgli5e3 >
* Nikita Kalyazin <kalyazin@amazon.com> [250918 13:16]: ... > > > > > > Nevertheless, let's step back for a second and instead focus on the problem > > > these patches are trying to solve, which is to allow guest_memfd to implement > > > UFFD_CONTINUE (or minor fault in other terminology). > > > > Well, this is about modularizing memory types, but the first user is > > supposed to be the guest-memfd support. > > > > > > > > This means uffd should be able to map a folio that's already in > > > the guest_memfd page cache to the faulted address. Obviously, the page table > > > update happens in uffd. But it still has to find what to map and we need > > > some way to let guest_memfd tell that to uffd. > > > > > > So we need a hook somewhere that will return a folio matching pgoff in > > > vma->file->inode. > > > > > > Do you see a way to implement it otherwise? > > > > I must be missing something. > > > > UFFDIO_CONTINUE currently enters through an ioctl that calls > > userfaultfd_continue() -> mfill_atomic_continue()... mfill_atomic() gets > > and uses the folio to actually do the work. Right now, we don't hand > > out the folio, so what is different here? > > > > I am under the impression that we don't need to return the folio, but > > may need to do work on it. That is, we can give the mm side what it > > needs to call the related memory type functions to service the request. > > > > For example, one could pass in the inode, pgoff, and memory type and the > > mm code could then call the fault handler for that memory type? > > > > I didn't think Nikita had a folio returned in his first three patches > > [1], but then they built on other patches and it was difficult to follow > > along. Is it because that interface was agreed on in a call on 23 Jan > > 2025 [2], as somewhat unclearly stated in [1]? > > I believe you can safely ignore what was discussed in [2] as it is > irrelevant to this discussion. That was just reasoning why it was possible > to use UserfaultFD for guest_memfd as opposed to inventing an alternative > solution to handling faults in userspace. > > Regarding returning a folio, [1] was calling vm_ops->fault() in UserfaultFD > code. The fault() itself gets a folio (at least in the guest_memfd > implementation [3]). Does it look like a preferable solution to you? I think this answers my question.. but I want to be sure. Does that mean you were getting the folio and doing the work in uffd without returning the folio? I tried to get those patches, but they didn't apply for me. What I want to do is limit the "memory type" that we support by restricting what is done to service the fault, and handle that in mm code (mm/uffd.c or whatever). What we get is more people using the same fault handler and thus more eyes and testing. Less code duplication. Unless there is a technical reason we need more flexibility? > > The other patches I was building on top of were mmap support in guest_memfd > [4], which is currently merged in kvm/next, and also part of [3]. Can we process it in the mm without returning the folio like the ioctl does today, or is there a technical reason that won't work? Thanks, Liam
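One shape an answer to that last question could take, purely as a sketch: the hook receives an opaque, mm-owned context and immediately hands the folio back into an mm helper, so the pointer never crosses the mm boundary. Every name below is hypothetical.

/* Opaque to drivers; defined only inside mm/userfaultfd.c. */
struct uffd_continue_ctx;

/* mm-provided: installs the PTE and releases the folio reference. */
int uffd_continue_install(struct uffd_continue_ctx *ctx, struct folio *folio);

/* A driver hook would then locate the folio and pass it straight back: */
static int example_uffd_continue(struct inode *inode, pgoff_t pgoff,
				 struct uffd_continue_ctx *ctx)
{
	struct folio *folio = filemap_lock_folio(inode->i_mapping, pgoff);

	if (IS_ERR(folio))
		return -EFAULT;
	/* The driver never stores the folio; mm consumes it immediately. */
	return uffd_continue_install(ctx, folio);
}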
On Thu, Sep 18, 2025 at 06:15:41PM +0100, Nikita Kalyazin wrote: > The other patches I was building on top of were mmap support in guest_memfd > [4], which is currently merged in kvm/next, and also part of [3]. > > [3] https://git.kernel.org/pub/scm/linux/kernel/git/david/linux.git/tree/virt/kvm/guest_memfd.c?id=911634bac3107b237dcd8fdcb6ac91a22741cbe7#n347 > [4] https://lore.kernel.org/kvm/20250729225455.670324-1-seanjc@google.com Note that the mmap implementation is changing to mmap_prepare. It's the case-in-point example of where we exposed an internal mm object (VMA) and experienced significant issues including embargoed critical zero-day security reports as a result. However, I took a quick look at [3] and it seems you're using it in a totally normal way, which is completely fine, and it'll be very easily convertible to mmap_prepare. Cheers, Lorenzo
On 18.09.25 19:45, Lorenzo Stoakes wrote: > On Thu, Sep 18, 2025 at 06:15:41PM +0100, Nikita Kalyazin wrote: >> The other patches I was building on top of were mmap support in guest_memfd >> [4], which is currently merged in kvm/next, and also part of [3]. >> >> [3] https://git.kernel.org/pub/scm/linux/kernel/git/david/linux.git/tree/virt/kvm/guest_memfd.c?id=911634bac3107b237dcd8fdcb6ac91a22741cbe7#n347 >> [4] https://lore.kernel.org/kvm/20250729225455.670324-1-seanjc@google.com > > Note that the mmap implementation is changing to mmap_prepare. It's the > case-in-point example of where we exposed an internal mm object (VMA) and > experienced significant issues including embargoed critical zero-day > security reports as a result. > > However, I took a quick look at [3] and it seems you're using it in a totally normal > way, which is completely fine, and it'll be very easily convertible > to mmap_prepare. Yes, we'll take care of that once it's (finally :D ) upstream! Re Nikita: If we could just reuse fault() for userfaultfd purposes, that might actually be pretty nice. -- Cheers David / dhildenb
On Thu, Sep 18, 2025 at 07:53:46PM +0200, David Hildenbrand wrote: > Re Nikita: If we could just reuse fault() for userfaultfd purposes, that > might actually be pretty nice. I commented on that. https://lore.kernel.org/all/aEiwHjl4tsUt98sh@x1.local/ That'll need to leak FAULT_FLAG_USERFAULT_CONTINUE which isn't necessary, make it extremely hard to know when to set the flag, and complicates the fault path which isn't necessary. I think Mike's comment was spot on, that the new API is literally do_fault() for shmem, but only used in userfaultfd context so it's even a one-liner. I do not maintain mm, so above is only my two cents, so I don't make decisions. Personally I still prefer the current approach of keeping the mm main fault path clean. Besides, this series also cleans up other places all over the place; the vm_uffd_ops is a most simplified description of a memory type. So IMHO it's beneficial in other aspects as well. If uffd_copy() is a concern, fine, we drop it. We don't plan to have more use of UFFDIO_COPY outside of the known three memory types after all. Thanks, -- Peter Xu
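For shmem, the hook Peter calls a one-liner would presumably reduce to something like this (a sketch; shmem_get_folio()'s argument list is abbreviated as in the earlier snippet, and varies across kernel versions):

static int shmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
				struct folio **foliop)
{
	/* Lookup-only variant of the shmem fault path. */
	return shmem_get_folio(inode, pgoff, foliop, SGP_NOALLOC);
}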
On 18.09.25 20:20, Peter Xu wrote: > On Thu, Sep 18, 2025 at 07:53:46PM +0200, David Hildenbrand wrote: > > Re Nikita: If we could just reuse fault() for userfaultfd purposes, that > > might actually be pretty nice. > > I commented on that. > > https://lore.kernel.org/all/aEiwHjl4tsUt98sh@x1.local/ > > That'll need to leak FAULT_FLAG_USERFAULT_CONTINUE which isn't necessary, > make it extremely hard to know when to set the flag, and complicates the > fault path which isn't necessary. I agree that FAULT_FLAG_USERFAULT_CONTINUE would be a very weird thing to have. I was wondering whether it could be abstracted in a cleaner way, similar to what you described with the "NO_USERFAULT", but possibly taking it one step further (if possible ...). In your reply you also mentioned "whether we can also avoid reusing fault() but instead resolve the page faults using the vm_ops hook too". So it kind-of is a special type of page fault, but the question would be how that could be integrated more cleanly. And as you also point out, the question would be which other users it might really have. In GUP we achieve not triggering userfaultfd by not setting FAULT_FLAG_ALLOW_RETRY. But it's rather that not allowing to retry (drop+retake locks) makes it impossible to call into userfaultfd. So not sure if abstracting/reusing that would make sense. > > I think Mike's comment was spot on, that the new API is literally > do_fault() for shmem, but only used in userfaultfd context so it's even a > one-liner. Right, special fault handling for userfaultfd. > > I do not maintain mm, so above is only my two cents, so I don't make > decisions. Personally I still prefer the current approach of keeping the mm > main fault path clean. Well, as clean as things like FAULT_FLAG_ALLOW_RETRY are. :) In any case, reading Liam's reply, it sounded like there is a path forward, so I don't particularly care, just wanted to mention what I had in mind regarding FAULT_FLAG_ALLOW_RETRY. -- Cheers David / dhildenb
On Mon, Sep 22, 2025 at 07:20:50PM +0200, David Hildenbrand wrote: > On 18.09.25 20:20, Peter Xu wrote: > > On Thu, Sep 18, 2025 at 07:53:46PM +0200, David Hildenbrand wrote: > > > Re Nikita: If we could just reuse fault() for userfaultfd purposes, that > > > might actually be pretty nice. > > > > I commented on that. > > > > https://lore.kernel.org/all/aEiwHjl4tsUt98sh@x1.local/ > > > > That'll need to leak FAULT_FLAG_USERFAULT_CONTINUE which isn't necessary, > > make it extremely hard to know when to set the flag, and complicates the > > fault path which isn't necessary. > > I agree that FAULT_FLAG_USERFAULT_CONTINUE would be a very weird thing to > have. > > I was wondering whether it could be abstracted in a cleaner way, similar to > what you described with the "NO_USERFAULT", but possibly taking it one step > further (if possible ...). > > In your reply you also mentioned "whether we can also avoid reusing fault() > but instead resolve the page faults using the vm_ops hook too". > > So it kind-of is a special type of page fault, but the question would be how > that could be integrated more cleanly. And as you also point out, the > question would be which other users it might really have. > > In GUP we achieve not triggering userfaultfd by not setting > FAULT_FLAG_ALLOW_RETRY. > > But it's rather that not allowing to retry (drop+retake locks) makes it > impossible to call into userfaultfd. Right. > > So not sure if abstracting/reusing that would make sense. Direct reuse is unlikely. The current semantics of FAULT_FLAG_ALLOW_RETRY are very specific; meanwhile, handle_userfault() actually has code to warn already when it's not set.. In this context, jumping into handle_userfault() is already wrong because this is about a request to fetch the page cache without being trappable. So if there'll be a new flag, it'll need to bypass handle_userfault(). But then if we'll need a new flag anyway.. IMHO it'll still be cleaner with uffd_get_folio(). It then sticks together with the uffd description of the memory type when one wants to opt in with MINOR faults, which should also be explicitly invoked only in userfaultfd minor fault resolutions (hence, no chance of breaking any form of fault() either..). Thanks, -- Peter Xu
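Put together, the MINOR-only opt-in discussed earlier in the thread would presumably wire up as below, with uffd_copy() simply left out. Names are illustrative (kvm_gmem_* as in the earlier sketch, and kvm_gmem_fault standing in for the existing guest_memfd fault handler), not taken from a posted series.

/* From the earlier sketch: lookup-only folio resolver. */
static int kvm_gmem_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
				   struct folio **foliop);

static const struct vm_uffd_ops kvm_gmem_uffd_ops = {
	.uffd_features	= VM_UFFD_MINOR,
	.uffd_ioctls	= BIT(_UFFDIO_CONTINUE),
	.uffd_get_folio	= kvm_gmem_uffd_get_folio,
	/* No uffd_copy: UFFDIO_COPY/ZEROPAGE remain unsupported. */
};

static const struct vm_operations_struct kvm_gmem_vm_ops = {
	.fault		 = kvm_gmem_fault,
#ifdef CONFIG_USERFAULTFD
	.userfaultfd_ops = &kvm_gmem_uffd_ops,
#endif
};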
* Peter Xu <peterx@redhat.com> [250918 14:21]: > On Thu, Sep 18, 2025 at 07:53:46PM +0200, David Hildenbrand wrote: > > Re Nikita: If we could just reuse fault() for userfaultfd purposes, that > > might actually be pretty nice. > > I commented on that. > > https://lore.kernel.org/all/aEiwHjl4tsUt98sh@x1.local/ [1] > > That'll need to leak FAULT_FLAG_USERFAULT_CONTINUE which isn't necessary, > make it extremely hard to know when to set the flag, and complicates the > fault path which isn't necessary. > > I think Mike's comment was spot on, that the new API is literally > do_fault() for shmem, but only used in userfaultfd context so it's even a > one-liner. > > I do not maintain mm, so above is only my two cents, so I don't make > decisions. Personally I still prefer the current approach of keeping the mm > main fault path clean. What we are trying to say is you can have a fault path that takes a type enum that calls into a function that does whatever you want. It can even live in mm/userfaultfd.c. It can then jump off to mm/guest-memfd.c which can contain super unique copying of memory if that's needed. That way driver/i_know_better_than_everyone.c or fs/stature.c don't decide they can register their uffd and do cool stuff that totally won't tank the system in random strange ways. Seriously, how many fault handlers are you expecting to have here? I'd be surprised if a lot of the code in these memory types isn't shared, but I guess if they are all over the kernel they'll just clone the code and bugs (like the arch code does, but with less of a reason). > Besides, this series also cleans up other places all over the place; the > vm_uffd_ops is a most simplified description of a memory type. 6 files changed, 207 insertions(+), 76 deletions(-) Can you please point me to which patch has the cleanup? The mm has uffd code _everywhere_, including mm/userfaultfd.c that jumps to fs/userfaultfd.c and back. Every entry has a LIST_HEAD(uf) [1] [2] [3] in it that is passed through the entire call stack in case it is needed [4] [5] [6] [7] [8]. It has if statements in core mm functions in the middle of loops [9]. It complicates error handling and has tricky locking [10] (full disclosure, I helped with the locking.. totally worth the complication). This is a seriously complicated feature. How is a generic callback that splits out into, probably 4?, functions the deal breaker here? How is leaking a flag the line that we won't cross? > So IMHO it's beneficial in other aspects as well. If uffd_copy() is a > concern, fine, we drop it. We don't plan to have more use of UFFDIO_COPY > outside of the known three memory types after all. EXACTLY! There are three memory types and we're going for the most flexible interface possible, with the most danger. With guest_memfd we're up to four functions we'd need. Why not keep the mm code in the mm and have four functions to choose from? If you want 5 we can always add another. Regards, Liam [1]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/mmap.c#L122 [2]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/vma.c#L3149 [3]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/util.c#L572 [4]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/mmap.c#L338 [5]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/mmap.c#L1063 [6]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/vma.c#L1478 [7]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/vma.c#L1517 [8]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/vma.c#L1563 [9]. 
https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/vma.c#L1407 [10]. https://elixir.bootlin.com/linux/v6.17-rc6/source/mm/userfaultfd.c#L69
On Thu, Sep 18, 2025 at 03:43:34PM -0400, Liam R. Howlett wrote: > * Peter Xu <peterx@redhat.com> [250918 14:21]: > > On Thu, Sep 18, 2025 at 07:53:46PM +0200, David Hildenbrand wrote: > > > Re Nikita: If we could just reuse fault() for userfaultfd purposes, that > > > might actually be pretty nice. > > > > I commented on that. > > > > https://lore.kernel.org/all/aEiwHjl4tsUt98sh@x1.local/ [1] > > > > That'll need to leak FAULT_FLAG_USERFAULT_CONTINUE which isn't necessary, > > make it extremely hard to know when to set the flag, and complicates the > > fault path which isn't necessary. > > > > I think Mike's comment was spot on, that the new API is literally > > do_fault() for shmem, but only used in userfaultfd context so it's even a > > one-liner. > > > > I do not maintain mm, so above is only my two cents, so I don't make > > decisions. Personally I still prefer the current approach of keeping the mm > > main fault path clean. > > What we are trying to say is you can have a fault path that takes a type > enum that calls into a function that does whatever you want. It can > even live in mm/userfaultfd.c. It can then jump off to mm/guest-memfd.c > which can contain super unique copying of memory if that's needed. Per mentioning of mm/guest-memfd.c, are you suggesting to take the guest-memfd library approach? We have in total at least three proposals: (a) https://lore.kernel.org/all/20250404154352.23078-1-kalyazin@amazon.com/ (b) this one (c) https://lore.kernel.org/all/20250915161815.40729-1-kalyazin@amazon.com/ I reviewed (a) and (c) and I provided my comments. If you prefer the library approach, feel free to reply directly to the (c) thread against my email. I chose (b), from when it was posted. > > That way driver/i_know_better_than_everyone.c or fs/stature.c don't > decide they can register their uffd and do cool stuff that totally won't > tank the system in random strange ways. What is the difference if they are allowed to register ->fault() and tank the system? > > Seriously, how many fault handlers are you expecting to have here? First of all, it's not about "how many". We can assume one user as of now. Talking about any future user doesn't really help. The choice I made above on (b) is the best solution I think, with any known possible users. The plan might change when more use cases pop up. However we can only try to make a fair decision with the current status quo. OTOH, the vm_uffd_ops also provides other fields (besides uffd_*() hooks). I wouldn't be surprised if a driver wants to opt in with some of the fields with zero hooks attached at all, when a userfaultfd feature is automatically friendly to all kinds of memory types. Consider one VMA that sets UFFDIO_WRITEPROTECT but without most of the rest. As I discussed, IMHO (b) is the clean way to describe userfaultfd demand for any memory type. > > I'd be surprised if a lot of the code in these memory types isn't > shared, but I guess if they are all over the kernel they'll just clone > the code and bugs (like the arch code does, but with less of a reason). > > > Besides, this series also cleans up other places all over the place; the > > vm_uffd_ops is a most simplified description of a memory type. > > 6 files changed, 207 insertions(+), 76 deletions(-) > > Can you please point me to which patch has the cleanup? Patch 4. If you want me to explain every change I touched that is a cleanup, I can go into details. Maybe it's faster if you read them, it's not a huge patch. 
> > The mm has uffd code _everywhere_, including mm/userfaultfd.c that jumps > to fs/userfaultfd.c and back. Every entry has a LIST_HEAD(uf) [1] [2] > [3] in it that is passed through the entire call stack in case it is > needed [4] [5] [6] [7] [8]. It has if statements in core mm functions > in the middle of loops [9]. It complicates error handling and has > tricky locking [10] (full disclosure, I helped with the locking.. > totally worth the complication). > > This is a seriously complicated feature. I know you're the VMA expert; you worked on a lot of it. I understand those things can cause you frustration when touching that code. Though please, let's not bring those into reviewing this series. Those have nothing to do with this series. Frankly, I also wish that one day we can get rid of some of them. It really depends on how many users there are for the uffd events besides the generic ones (fault traps). > > How is a generic callback that splits out into, probably 4?, functions > the deal breaker here? > > How is leaking a flag the line that we won't cross? > > > So IMHO it's beneficial in other aspects as well. If uffd_copy() is a > > concern, fine, we drop it. We don't plan to have more use of UFFDIO_COPY > > outside of the known three memory types after all. > > EXACTLY! There are three memory types and we're going for the most > flexible interface possible, with the most danger. With guest_memfd > we're up to four functions we'd need. Why not keep the mm code in the > mm and have four functions to choose from? If you want 5 we can always > add another. I assume for most of the rest of the comments, you're suggesting the library approach. If so, again, I suggest we discuss explicitly in that thread. The library code may (or may not) be useful for other purposes. For the support of userfaultfd, it definitely doesn't need to be a dependency. OTOH, we may still want some abstraction like this one with/without the library. If so, I don't really see why we need to push back on this. AFAIU, the only concern (after dropping uffd_copy) is we'll expose uffd_get_folio(). Please help explain why ->fault() isn't worse. If we accepted ->fault() for all these years, I don't see a reason we should reject ->uffd_get_folio(), especially when one of the goals is to keep the core mm path clean, per my comment to proposal (a). -- Peter Xu
* Peter Xu <peterx@redhat.com> [250918 17:07]: > On Thu, Sep 18, 2025 at 03:43:34PM -0400, Liam R. Howlett wrote: > > * Peter Xu <peterx@redhat.com> [250918 14:21]: > > > On Thu, Sep 18, 2025 at 07:53:46PM +0200, David Hildenbrand wrote: > > > > Re Nikita: If we could just reuse fault() for userfaultfd purposes, that > > > > might actually be pretty nice. > > > > > > I commented on that. > > > > > > https://lore.kernel.org/all/aEiwHjl4tsUt98sh@x1.local/ > > > > [1] > > > > > > > > > > That'll need to leak FAULT_FLAG_USERFAULT_CONTINUE which isn't necessary, > > > > make it extremely hard to know when to set the flag, and complicates the > > > > fault path which isn't necessary. > > > > > > > > I think Mike's comment was spot on, that the new API is literally > > > > do_fault() for shmem, but only used in userfaultfd context so it's even a > > > > one-liner. > > > > > > > > I do not maintain mm, so above is only my two cents, so I don't make > > > > decisions. Personally I still prefer the current approach of keeping the mm > > > > main fault path clean. > > > > > > What we are trying to say is you can have a fault path that takes a type > > > enum that calls into a function that does whatever you want. It can > > > even live in mm/userfaultfd.c. It can then jump off to mm/guest-memfd.c > > > which can contain super unique copying of memory if that's needed. > > > > Per mentioning of mm/guest-memfd.c, are you suggesting to take the guest-memfd > > library approach? > > > We have in total at least three proposals: > > > > (a) https://lore.kernel.org/all/20250404154352.23078-1-kalyazin@amazon.com/ > > (b) this one > > (c) https://lore.kernel.org/all/20250915161815.40729-1-kalyazin@amazon.com/ > > > > I reviewed (a) and (c) and I provided my comments. If you prefer the > > library approach, feel free to reply directly to the (c) thread against my > > email. > > > > I chose (b), from when it was posted. > > Honestly, I don't know what I'd vote for because I don't like any of > them. I'd choose (d), the do-nothing option. > > > > > > > > > > > That way driver/i_know_better_than_everyone.c or fs/stature.c don't > > > decide they can register their uffd and do cool stuff that totally won't > > > tank the system in random strange ways. > > > > What is the difference if they are allowed to register ->fault() and tank > > the system? > > One less problem. > > More people with mm experience looking at the handling of folios. > > The common code not being cloned and kept up to date when an issue in > the original is discovered. > > Having to only update a few fault handlers when there is a folio or > other mm change. > > Hopefully better testing? > > > > > > > Seriously, how many fault handlers are you expecting to have here? > > > > First of all, it's not about "how many". We can assume one user as of now. > > Talking about any future user doesn't really help. The choice I made above > > on (b) is the best solution I think, with any known possible users. The > > plan might change when more use cases pop up. However we can only try to > > make a fair decision with the current status quo. > > Planning to handle one, five, or two billion makes a difference in what > you do. Your plan right now enables everyone to do whatever they want, > where they want. I don't think we need this sort of flexibility with > the limited number of users? > > > > > OTOH, the vm_uffd_ops also provides other fields (besides uffd_*() hooks). 
> > Consider one VMA that sets UFFDIO_WRITEPROTECT but without most of the > rest. > > As I discussed, IMHO (b) is the clean way to describe userfaultfd demand > for any memory type. > > > > > > I'd be surprised if a lot of the code in these memory types isn't > > shared, but I guess if they are all over the kernel they'll just clone > > the code and bugs (like the arch code does, but with less of a reason). > > > > > Besides, this series also cleans up other places all over the place; the > > > vm_uffd_ops is a most simplified description of a memory type. > > > > 6 files changed, 207 insertions(+), 76 deletions(-) > > > > Can you please point me to which patch has the cleanup? > > Patch 4. If you want me to explain every change I touched that is a > cleanup, I can go into details. Maybe it's faster if you read them, it's > not a huge patch. I responded here [1]. I actually put a lot of effort into that response and took a lot of time to dig into some of this to figure out if it was possible, and suggested some ideas. That was back in July, so the details aren't that fresh anymore. Maybe you missed my reply? I was hoping that, if the code was cleaned up, a solution may be more clear. I think the ops idea has a lot of positives. I also think you don't need to return a folio pointer to make it work. > > > > How is a generic callback that splits out into, probably 4?, functions > > the deal breaker here? > > > > How is leaking a flag the line that we won't cross? > > > > > So IMHO it's beneficial in other aspects as well. If uffd_copy() is a > > > concern, fine, we drop it. We don't plan to have more use of UFFDIO_COPY > > > outside of the known three memory types after all. > > > > EXACTLY! There are three memory types and we're going for the most > > flexible interface possible, with the most danger. With guest_memfd > > we're up to four functions we'd need. Why not keep the mm code in the > > mm and have four functions to choose from? If you want 5 we can always > > add another. > > I assume for most of the rest of the comments, you're suggesting the library > approach. If so, again, I suggest we discuss explicitly in that thread. > > The library code may (or may not) be useful for other purposes. For the > support of userfaultfd, it definitely doesn't need to be a dependency. > OTOH, we may still want some abstraction like this one with/without the > library. If so, I don't really see why we need to push back on this. > > AFAIU, the only concern (after dropping uffd_copy) is we'll expose > uffd_get_folio(). If you read my response [1], then you can see that I find the ops underutilized and lacking in code simplification. > Please help explain why ->fault() isn't worse. I'm not sure it is worse, but I don't think it's necessary to return a folio for 4 users. And I think it could be better if we handled the operations on the folio internally, if at all possible. > > If we accepted ->fault() for all these years, I don't see a reason we > should reject ->uffd_get_folio(), especially when one of the goals is to keep > the core mm path clean, per my comment to proposal (a). I see this argument as saying "there's a hole in our boat so why can't I make another?" It's not the direction we have to go to get what we need right now, so why are we doing it? Like you said, it can be evaluated later if things change.. My thoughts were around an idea that we only really need to do a limited number of operations on that pointer you are returning. Those operations may share code, and could be internal to mm. 
I don't see this as (a), (b), or (c), but maybe an addition to (b)? Maybe we need more ops to cover the uses? So, I think I do want the vm_uffd_ops idea, but not as it is written right now. Thanks, Liam [1]. https://lore.kernel.org/all/e7vr62s73dftijeveyg6lfgivctijz4qcar3teswjbuv6gog3k@4sbpuj35nbbh/
On Thu, Sep 18, 2025 at 09:50:49PM -0400, Liam R. Howlett wrote:

...

> > > Can you please point me to which patch has clean up?
> >
> > Patch 4. If you want me to explain every change I touched that is a
> > cleanup, I can go into details. Maybe it's faster if you read them;
> > it's not a huge patch.
>
> I responded here [1]. I actually put a lot of effort into that response
> and took a lot of time to dig into some of this to figure out if it was
> possible, and suggested some ideas.
>
> That was back in July, so the details aren't that fresh anymore. Maybe
> you missed my reply?

AFAICT, we made it the other way round. My reply is here:

https://lore.kernel.org/all/aMnAscxj_h42wOAC@x1.local/

...

> If you read my response [1], then you can see that I find the ops
> underutilized and lacking in code simplification.

I tried to explain, in my reply above, why what you mentioned is
completely orthogonal to this change.

> > Please help explain why ->fault() isn't worse.
>
> I'm not sure it is worse, but I don't think it's necessary to return a
> folio for 4 users. And I think it could be better if we handled the
> operations on the folio internally, if at all possible.
>
> > If we accepted ->fault() for all these years, I don't see a reason we
> > should reject ->uffd_get_folio(), especially when one of the goals is
> > to keep the core mm path clean, per my comment on proposal (a).
>
> I see this argument as saying "there's a hole in our boat, so why can't
> I make another?" It's not the direction we have to go to get what we
> need right now, so why are we doing it? Like you said, it can be
> evaluated later if things change.

You described ->fault() as "a hole in our boat"? I'm astonished and do
not know what to say to this.

There was a great comment saying one may want to make Linux a
unikernel. I thought it was a good one, but only when it was a joke. Is
it not?

> My thoughts were around an idea that we only really need to do a
> limited number of operations on that pointer you are returning. Those
> operations may share code, and could be internal to mm. I don't see
> this as (a), (b), or (c), but maybe an addition to (b)? Maybe we need
> more ops to cover the uses?

That's exactly what this proposal is about, isn't it? The userfaultfd
minor fault shares almost all the code except the one hook fetching a
folio from the page cache of the memory type.

"Could be internal to mm" is (c) at least. No one can do what you
mentioned without moving guest-memfd into mm/ first.

Nikita and I drafted these changes, so we likely have a better idea of
what is happening.

Would you perhaps implement your idea, if that's better? Either you're
right and we're happy to use it, or you'll find what you're missing.

It's not required, but now I think it might be a good idea, since you
are so strongly pushing back on this userfaultfd feature. I hope it's
fair to request a better solution from you when you've rejected almost
every solution.

> So, I think I do want the vm_uffd_ops idea, but not as it is written
> right now.
>
> Thanks,
> Liam
>
> [1]. https://lore.kernel.org/all/e7vr62s73dftijeveyg6lfgivctijz4qcar3teswjbuv6gog3k@4sbpuj35nbbh/

--
Peter Xu
* Peter Xu <peterx@redhat.com> [250919 10:17]:

...

> > > > Can you please point me to which patch has clean up?
> > >
> > > Patch 4. If you want me to explain every change I touched that is a
> > > cleanup, I can go into details. Maybe it's faster if you read them;
> > > it's not a huge patch.
> >
> > I responded here [1]. I actually put a lot of effort into that
> > response and took a lot of time to dig into some of this to figure
> > out if it was possible, and suggested some ideas.
> >
> > That was back in July, so the details aren't that fresh anymore.
> > Maybe you missed my reply?
>
> AFAICT, we made it the other way round. My reply is here:
>
> https://lore.kernel.org/all/aMnAscxj_h42wOAC@x1.local/

Thanks, yes. I missed your reply.

> > > If we accepted ->fault() for all these years, I don't see a reason
> > > we should reject ->uffd_get_folio(), especially when one of the
> > > goals is to keep the core mm path clean, per my comment on
> > > proposal (a).
> >
> > I see this argument as saying "there's a hole in our boat, so why
> > can't I make another?" It's not the direction we have to go to get
> > what we need right now, so why are we doing it? Like you said, it can
> > be evaluated later if things change.
>
> You described ->fault() as "a hole in our boat"? I'm astonished and do
> not know what to say to this.
>
> There was a great comment saying one may want to make Linux a
> unikernel. I thought it was a good one, but only when it was a joke.
> Is it not?

Well, it's leaking the internals, which is what we don't want to do.
Certainly it is useful and does what is needed.

> > My thoughts were around an idea that we only really need to do a
> > limited number of operations on that pointer you are returning. Those
> > operations may share code, and could be internal to mm. I don't see
> > this as (a), (b), or (c), but maybe an addition to (b)? Maybe we need
> > more ops to cover the uses?
>
> That's exactly what this proposal is about, isn't it? The userfaultfd
> minor fault shares almost all the code except the one hook fetching a
> folio from the page cache of the memory type.
>
> "Could be internal to mm" is (c) at least. No one can do what you
> mentioned without moving guest-memfd into mm/ first.
>
> Nikita and I drafted these changes, so we likely have a better idea of
> what is happening.
>
> Would you perhaps implement your idea, if that's better? Either you're
> right and we're happy to use it, or you'll find what you're missing.

I spoke to Mike on this and I understand what I was missing. I'm fine
with the folio part as you have it.

Apologies for holding this up and the added stress on your side.

Thanks,
Liam
On Fri, Sep 19, 2025 at 03:38:51PM -0400, Liam R. Howlett wrote:
> I spoke to Mike on this and I understand what I was missing. I'm fine
> with the folio part as you have it.
>
> Apologies for holding this up and the added stress on your side.

Thank you for changing your mind.

Mike, I definitely appreciate your help along the way since the start.

I'll wait 2-3 more days to see whether there are any further objections
from anyone, then I'll repost.

--
Peter Xu
On Fri, Sep 19, 2025 at 10:16:57AM -0400, Peter Xu wrote:
>
> You described ->fault() as "a hole in our boat"? I'm astonished and do
> not know what to say to this.

...

> It's not required, but now I think it might be a good idea, since you
> are so strongly pushing back on this userfaultfd feature. I hope it's
> fair to request a better solution from you when you've rejected almost
> every solution.

Peter -

I've been staying out of this discussion as I'm about to go to Kernel
Recipes and then off on a (well-needed!) holiday, and I simply lack the
bandwidth right now.

But I think we should all calm down a little here :)

Liam and I (more so Liam recently, for the above reasons) have pushed
back because we have both personally experienced the consequences of
giving drivers too much flexibility wrt core mm functionality.

This is the sole reason we have done so.

We are both eager to find a way forward that is constructive and works
well for everybody involved. We WANT this series to land.

So I think perhaps we should take a step back and identify clearly what
the issues are and how we might best address them.

I spoke to Mike off-list, who suggested perhaps things aren't quite as
egregious as they seem with uffd_get_folio(), so perhaps this is a means
of moving forward.

But I think in broad terms - let's identify what the sensible options
are, and then drill down into whichever one we agree is best to move
forwards with.

Again, apologies for not being able to be more involved here;
workload/other engagements dictate that I am unable to be.

Cheers, Lorenzo
On Fri, Sep 19, 2025 at 03:34:39PM +0100, Lorenzo Stoakes wrote:
> Peter -
>
> I've been staying out of this discussion as I'm about to go to Kernel
> Recipes and then off on a (well-needed!) holiday, and I simply lack the
> bandwidth right now.

...

> Again, apologies for not being able to be more involved here;
> workload/other engagements dictate that I am unable to be.

That's totally fine, Lorenzo. I appreciate your help in figuring things
out. I do agree the discussion actually went nowhere.

I think the "issue" so far is very much clear - it's about exporting
uffd_get_folio(), as you correctly pointed out, and I'm glad you
discussed it with Mike. My point is that the hook is totally fine, and
we need it exactly because we want to keep the ->fault() semantics
clean.

Just to mention: if this series cannot land, I prefer landing Nikita's
very old version (a). That'll make the mm fault() path ugly - I pointed
that out - but if everyone prefers that and is willing to sign off on
it, I'm OK from the userfaultfd perspective. I don't make the judgement
there.

Then this series can drop uffd_get_folio() and keep the rest in one way
or another, describing memory type attributes only, needing only a
cooperating driver ->fault() that handles the new flag. But then this
series would be a pure cleanup. I'll likely put it aside once it stops
blocking things, as I also have a queue of my own to flush elsewhere.

I wish we could just go with this series, with uffd_get_folio() only.

Feel free to discuss with more people, and let me know how this series
should move on.

Thanks a lot,

--
Peter Xu
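To make the fallback Peter sketches above concrete: with both hooks
dropped, a vm_uffd_ops would shrink to pure attributes. Below is a
minimal sketch assuming the fields from this series; the example_* name
and the particular bits chosen are illustrative only:

	#include <linux/mm.h>
	#include <linux/userfaultfd_k.h>

	/*
	 * Hypothetical "attributes only" vm_uffd_ops: no hooks, just a
	 * declaration of what the memory type supports. Fault resolution
	 * would then fall back to the driver's ->fault() plus a new flag.
	 */
	static const struct vm_uffd_ops example_attrs_only_uffd_ops = {
		.uffd_features	= VM_UFFD_MINOR,
		.uffd_ioctls	= BIT(_UFFDIO_CONTINUE),
		/* .uffd_get_folio / .uffd_copy intentionally left unset */
	};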
On Mon, Sep 08, 2025 at 12:53:37PM -0400, Liam R. Howlett wrote:
> What we don't want is non-mm code specifying values for the function
> pointer and doing what they want, or a function pointer that returns a
> core mm resource (in the old example this was a vma, here it is a
> folio).
>
> From this patch set:
> + * Return: zero if succeeded, negative for errors.
> + */
> + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff,
> + struct folio **folio);
>
> This is one of the contention points in the current scenario, as the
> folio would be returned.

OK, I didn't see this one previously; it partly answers one of my
questions in the other reply, in a way I wished it wouldn't.

Could you elaborate why an API returning a folio pointer would be
dangerous?

OTOH, would you think alloc_pages() or folio_alloc() are dangerous too?

They return a folio from the mm core to drivers. It's not the same
direction of folio sharing; however, it also means that at least the
driver can manipulate the folio / memmap as much as it wants, so
sabotaging everything is similarly possible. Why do we worry about
that?

Are we going to unexport alloc_pages() someday?

Thanks,

--
Peter Xu
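For reference, a driver-side implementation of the hook being debated
might look like the following. This is a minimal sketch assuming the
vm_uffd_ops signature from this series; the example_* name and the
plain page-cache lookup are illustrative, not taken from any real
driver:

	#include <linux/pagemap.h>

	/* Sketch: resolve UFFDIO_CONTINUE by looking up an existing folio
	 * in the inode's page cache. Minor faults must never allocate. */
	static int example_uffd_get_folio(struct inode *inode, pgoff_t pgoff,
					  struct folio **folio)
	{
		struct folio *f = filemap_get_folio(inode->i_mapping, pgoff);

		if (IS_ERR(f))
			return PTR_ERR(f);	/* typically -ENOENT */

		*folio = f;	/* returned with a reference held */
		return 0;
	}

The contention in this sub-thread is not about the lookup itself, but
about the folio reference crossing the mm boundary on its way back.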
* Peter Xu <peterx@redhat.com> [250916 16:05]:
> On Mon, Sep 08, 2025 at 12:53:37PM -0400, Liam R. Howlett wrote:
> > What we don't want is non-mm code specifying values for the function
> > pointer and doing what they want, or a function pointer that returns
> > a core mm resource (in the old example this was a vma, here it is a
> > folio).
> >
> > From this patch set:
> > + * Return: zero if succeeded, negative for errors.
> > + */
> > + int (*uffd_get_folio)(struct inode *inode, pgoff_t pgoff,
> > + struct folio **folio);
> >
> > This is one of the contention points in the current scenario, as the
> > folio would be returned.
>
> OK, I didn't see this one previously; it partly answers one of my
> questions in the other reply, in a way I wished it wouldn't.
>
> Could you elaborate why an API returning a folio pointer would be
> dangerous?

I did [1], and Lorenzo did [2]. This is a bad design that has gotten us
in trouble, and we don't need to do it. This is an anti-pattern.

> OTOH, would you think alloc_pages() or folio_alloc() are dangerous too?
>
> They return a folio from the mm core to drivers. It's not the same
> direction of folio sharing; however, it also means that at least the
> driver can manipulate the folio / memmap as much as it wants, so
> sabotaging everything is similarly possible. Why do we worry about
> that?

Are you expecting the same number of memory types as drivers? I'm not
sure I want to live in that reality.

> Are we going to unexport alloc_pages() someday?

I guess, over a long enough timeline, all functions will be unexported.

And this is exactly why a 'two phase' approach to 'revisit if
necessary' [3] is a problem.

When we tried to remove the use of mm pointers in drivers, we were
stuck looking at hundreds of lines of code in a single driver trying to
figure out what was going on. Seriously, I added locking and it caused
a regression, so they removed it [4]. It took years to get that driver
to a more sensible state, and I'm really happy that Carlos Llamas did
all that work! We regularly had to fight with people to stop caching a
pointer internally.

You know why they needed the vma pointer? To modify the vma flags, and
only a few modifications were supposed to be made... yet it spawned
years of cleanup.

And you're asking us to do it again.

Why can't we use enums to figure out what to do [5], one of which could
be the new functionality for guest_memfd? There are many ways that this
can be done, with limited code living in the mm, that are safer, more
maintainable, and more testable than handing out pointers that have
locking hell [6] to anyone with a source file.

Thanks,
Liam

[1]. https://lore.kernel.org/linux-mm/5ixvg4tnwj53ixh2fx26dxlctgqtayydqryqxhns6bwj3q3ies@6sttjti5dxt7/
[2]. https://lore.kernel.org/linux-mm/982f4f94-f0bf-45dd-9003-081b76e57027@lucifer.local/
[3]. https://lore.kernel.org/linux-mm/e9235b88-e2be-4358-a6fb-507c5cad6fd9@lucifer.local/
[4]. https://lore.kernel.org/all/20220621140212.vpkio64idahetbyf@revolver/T/#m9d9c8911447e395a73448700d7f06a4366b5ae02
[5]. https://lore.kernel.org/linux-mm/54bb09fc-144b-4d61-8bc2-1eca4d278382@lucifer.local/
[6]. https://elixir.bootlin.com/linux/v6.16.7/source/mm/rmap.c#L21
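A rough sketch of the enum dispatch Liam points to in [5] - every name
below is hypothetical, nothing here exists in the kernel, and the
per-type helpers would live inside mm/:

	/* Hypothetical: memory types stay enumerable inside mm, so the
	 * uffd paths dispatch on a type instead of driver-owned hooks. */
	enum uffd_mem_type {
		UFFD_MEM_ANON,
		UFFD_MEM_SHMEM,
		UFFD_MEM_HUGETLB,
		UFFD_MEM_GUEST_MEMFD,
	};

	/* Per-type helpers (hypothetical, internal to mm): */
	static int uffd_continue_shmem(struct inode *, pgoff_t, struct folio **);
	static int uffd_continue_hugetlb(struct inode *, pgoff_t, struct folio **);
	static int uffd_continue_guest_memfd(struct inode *, pgoff_t, struct folio **);

	static int uffd_do_continue(enum uffd_mem_type type, struct inode *inode,
				    pgoff_t pgoff, struct folio **folio)
	{
		switch (type) {
		case UFFD_MEM_SHMEM:
			return uffd_continue_shmem(inode, pgoff, folio);
		case UFFD_MEM_HUGETLB:
			return uffd_continue_hugetlb(inode, pgoff, folio);
		case UFFD_MEM_GUEST_MEMFD:
			/* could live in mm/guest-memfd.c, still inside mm */
			return uffd_continue_guest_memfd(inode, pgoff, folio);
		default:
			return -EINVAL;
		}
	}

The trade-off against vm_uffd_ops is that adding a memory type means
touching core mm again, which is exactly what the series tries to avoid.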
* Suren Baghdasaryan <surenb@google.com> [250701 13:04]:

...

> > > + /**
> > > + * uffd_copy: Handler to resolve UFFDIO_COPY|ZEROPAGE request.
> > > + *
> > > + * @dst_pmd: target pmd to resolve page fault
> > > + * @dst_vma: target vma
> > > + * @dst_addr: target virtual address
> > > + * @src_addr: source address to copy from
> > > + * @flags: userfaultfd request flags
> > > + * @foliop: previously allocated folio
> > > + *
> > > + * Return: zero if succeeded, negative for errors.
> >
> > Can you please ensure you put details as to the VMA lock state here.
> > Uffd has some very tricky handling around stuff like this.
> >
> > > + */
> > > + int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma,
> > > + unsigned long dst_addr, unsigned long src_addr,
> > > + uffd_flags_t flags, struct folio **foliop);
> >
> > Do we not need a uffd_ctx parameter here?
> >
> > It seems like we're assuming a _lot_ of mm understanding in the
> > underlying driver here.
> >
> > I'm not sure it's really normal to be handing around page table state
> > and folios etc. to a driver like this; this is really... worrying to
> > me.
> >
> > This feels like you're trying to put mm functionality outside of mm?
>
> To second that, two things stick out for me here:
> 1. uffd_copy and uffd_get_folio seem to be at different abstraction
> levels. uffd_copy is almost the entire copy operation for VM_SHARED
> VMAs, while uffd_get_folio is a small part of the continue operation.
> 2. shmem_mfill_atomic_pte, which becomes uffd_copy for shmem in the
> last patch, is quite a complex function which itself calls some IMO
> pretty internal functions like mfill_atomic_install_pte(). Expecting
> modules to implement such functionality seems like a stretch to me,

Yes. I don't think this is a good idea to expose, since there is no way
to restrict it to specific implementations.

We used to pass out a vma to a driver for updating the vma flags, and
even that proved to be too permissive, and it ended very poorly. We had
drivers saving the pointer and then dropping the lock; we had drivers
changing things under us. Then there was the fallout in the mm from
trying to deal with what may have happened - and the failure scenarios
of dealing with it didn't work out.

What this is doing is leading down a path to allow such things at an
even lower level. I think this is too flexible and opens us up to
unintentional abuse.

That is to say, I don't think we should expose this outside of mm, for
the benefit of everyone.

> but maybe this is for some specialized modules which are written by mm
> experts only?

Even with 'experts' (do any of us really know what we're doing,
anyways? ;) we may get into a situation where a company may write
themselves into a corner, depending on specific functionality that's
hard to re-implement and a nightmare to keep working. I don't want to
bind.. er.. to a specific example, but I believe we can all think of at
least one.

Is there more information you can share as to why you want to expose
this functionality? Maybe we can find another way that does not expose
the internals and accomplishes what you want?

Thanks,
Liam
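To illustrate the abstraction-level gap Suren describes, here is
roughly what a module-side uffd_copy() would have to cover, following
the shmem_mfill_atomic_pte() pattern named above. A hedged outline of
the steps only, not working code; the example_* name is invented:

	/* Outline only: the work ->uffd_copy() pushes into a module. */
	static int example_uffd_copy(pmd_t *dst_pmd, struct vm_area_struct *dst_vma,
				     unsigned long dst_addr, unsigned long src_addr,
				     uffd_flags_t flags, struct folio **foliop)
	{
		/*
		 * 1. Allocate a folio in the backing store, or reuse *foliop
		 *    if one was carried over from a previous failed attempt.
		 * 2. For UFFDIO_COPY, copy the contents from the user address
		 *    src_addr; for UFFDIO_ZEROPAGE, zero the page. If the
		 *    copy faults, stash the folio in *foliop and return
		 *    -ENOENT so the caller can retry after dropping locks.
		 * 3. Install the PTE at dst_addr under the page table lock -
		 *    in the series this is mfill_atomic_install_pte(), the
		 *    internal helper Suren points at above.
		 */
		return -ENOSYS;	/* outline only */
	}

Step 3 is the crux of the objection: it is page-table surgery that,
today, only core mm performs.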
Hi Peter,

On Fri, Jun 27, 2025 at 11:46:52AM -0400, Peter Xu wrote:
> Introduce a generic userfaultfd API for vm_operations_struct, so that
> one vma, especially when as a module, can support userfaults without
> modifying the core files. More importantly, when the module can be
> compiled out of the kernel.
>
> So, instead of having core mm referencing modules that may not ever
> exist, we need to have modules opt-in on core mm hooks instead.
>
> After this API applied, if a module wants to support userfaultfd, the
> module should only need to touch its own file and properly define
> vm_uffd_ops, instead of changing anything in core mm.

I liked the changelog update you proposed in the v1 thread. I took the
liberty of slightly updating it, and here's what I've got:

Currently, most of the userfaultfd features are implemented directly in
the core mm. It will invoke VMA specific functions whenever necessary.
So far it is fine because it almost only interacts with shmem and
hugetlbfs.

Introduce a generic userfaultfd API extension for vm_operations_struct,
so that any code that implements vm_operations_struct (including kernel
modules that can be compiled separately from the kernel core) can
support userfaults without modifying the core files.

With this API applied, if a module wants to support userfaultfd, the
module should only need to properly define vm_uffd_ops and hook it to
vm_operations_struct, instead of changing anything in core mm.

> Note that such API will not work for anonymous. Core mm will process
> anonymous memory separately for userfault operations like before.

Maybe:

This API will not work for anonymous memory. Handling of userfault
operations for anonymous memory remains unchanged in core mm.

> This patch only introduces the API alone so that we can start to move
> existing users over but without breaking them.

Please use the imperative mood, e.g.

Only introduce the new API so that ...

> Currently the uffd_copy() API is almost designed to be the simplistic
> with minimum mm changes to move over to the API.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  include/linux/mm.h            |  9 ++++++
>  include/linux/userfaultfd_k.h | 52 +++++++++++++++++++++++++++++++++++
>  2 files changed, 61 insertions(+)

...

> +	int (*uffd_copy)(pmd_t *dst_pmd, struct vm_area_struct *dst_vma,
> +			 unsigned long dst_addr, unsigned long src_addr,
> +			 uffd_flags_t flags, struct folio **foliop);
> +};
> +typedef struct vm_uffd_ops vm_uffd_ops;

Either use vm_uffd_ops_t for the typedef or drop the typedef entirely.
My preference is for the second option.

> +
>  #define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
>  #define MFILL_ATOMIC_BIT(nr) BIT(MFILL_ATOMIC_MODE_BITS + (nr))
>  #define MFILL_ATOMIC_FLAG(nr) ((__force uffd_flags_t) MFILL_ATOMIC_BIT(nr))
> --
> 2.49.0

--
Sincerely yours,
Mike.
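In code terms, the opt-in that Mike's reworded changelog describes
would look roughly like this. The two struct types and the
userfaultfd_ops member come from the patch; everything named example_*
is invented for illustration, and the feature/ioctl bits merely follow
the field documentation above:

	#include <linux/mm.h>
	#include <linux/userfaultfd_k.h>

	static const struct vm_uffd_ops example_uffd_ops = {
		.uffd_features	= VM_UFFD_MISSING | VM_UFFD_MINOR,
		/* _UFFDIO_COPY is mandatory once VM_UFFD_MISSING is claimed;
		 * _UFFDIO_ZEROPAGE stays optional, per the field docs. */
		.uffd_ioctls	= BIT(_UFFDIO_COPY) | BIT(_UFFDIO_CONTINUE),
		.uffd_get_folio	= example_uffd_get_folio,
		.uffd_copy	= example_uffd_copy,
	};

	static const struct vm_operations_struct example_vm_ops = {
		.fault		 = example_fault,
	#ifdef CONFIG_USERFAULTFD
		.userfaultfd_ops = &example_uffd_ops,
	#endif
	};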
On Sun, Jun 29, 2025 at 11:50:11AM +0300, Mike Rapoport wrote:
> Hi Peter,

Hi, Mike,

> On Fri, Jun 27, 2025 at 11:46:52AM -0400, Peter Xu wrote:
> > Introduce a generic userfaultfd API for vm_operations_struct, so that
> > one vma, especially when as a module, can support userfaults without
> > modifying the core files. More importantly, when the module can be
> > compiled out of the kernel.
> >
> > So, instead of having core mm referencing modules that may not ever
> > exist, we need to have modules opt-in on core mm hooks instead.
> >
> > After this API applied, if a module wants to support userfaultfd, the
> > module should only need to touch its own file and properly define
> > vm_uffd_ops, instead of changing anything in core mm.
>
> I liked the changelog update you proposed in the v1 thread. I took the
> liberty of slightly updating it, and here's what I've got:

It's definitely hard to satisfy all reviewers with one version of a
commit message..

> Currently, most of the userfaultfd features are implemented directly in
> the core mm. It will invoke VMA specific functions whenever necessary.
> So far it is fine because it almost only interacts with shmem and
> hugetlbfs.
>
> Introduce a generic userfaultfd API extension for vm_operations_struct,
> so that any code that implements vm_operations_struct (including kernel
> modules that can be compiled separately from the kernel core) can
> support userfaults without modifying the core files.
>
> With this API applied, if a module wants to support userfaultfd, the
> module should only need to properly define vm_uffd_ops and hook it to
> vm_operations_struct, instead of changing anything in core mm.

Thanks, I very much appreciate explicit suggestions on the wording.
Personally I like it and the rest of the suggestions; I'll use them
when I repost, but I'll also wait in case anyone else has other things
to say.

...

> > +typedef struct vm_uffd_ops vm_uffd_ops;
>
> Either use vm_uffd_ops_t for the typedef or drop the typedef entirely.
> My preference is for the second option.

Andrew helped me fix some hidden whitespace, which I appreciated; then
I found checkpatch warns on this one too, besides the spaces fixed in
mm-new. I do not know why checkpatch doesn't like typedefs even though
typedefs are used extensively in Linux.. I think I'll simply stick with
not using the typedef.

Thanks,

--
Peter Xu