From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
userfaultfd notifications about minor page faults are used for live
migration and snapshotting of VMs with memory backed by shared hugetlbfs or
tmpfs mappings, as described in detail in commit 7677f7fd8be7 ("userfaultfd:
add minor fault registration mode").
To use the same mechanism for VMs that use guest_memfd to map their memory,
guest_memfd should support userfaultfd minor mode.
Extend ->fault() method of guest_memfd with ability to notify core page
fault handler that a page fault requires handle_userfault(VM_UFFD_MINOR) to
complete and add implementation of ->get_folio_noalloc() to guest_memfd
vm_ops.
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++-
1 file changed, 32 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index ffadc5ee8e04..dca6e373937b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/kvm_host.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
+#include <linux/userfaultfd_k.h>
#include "kvm_mm.h"
@@ -359,7 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
return VM_FAULT_SIGBUS;
- folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+ folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
+ if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
+ ret = VM_FAULT_UFFD_MINOR;
+ goto out_folio;
+ }
+
+ if (PTR_ERR(folio) == -ENOENT)
+ folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+
if (IS_ERR(folio)) {
int err = PTR_ERR(folio);
@@ -390,8 +399,30 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
return ret;
}
+#ifdef CONFIG_USERFAULTFD
+static struct folio *kvm_gmem_get_folio_noalloc(struct inode *inode,
+ pgoff_t pgoff)
+{
+ struct folio *folio;
+
+ folio = filemap_lock_folio(inode->i_mapping, pgoff);
+ if (IS_ERR_OR_NULL(folio))
+ return folio;
+
+ if (!folio_test_uptodate(folio)) {
+ clear_highpage(folio_page(folio, 0));
+ kvm_gmem_mark_prepared(folio);
+ }
+
+ return folio;
+}
+#endif
+
static const struct vm_operations_struct kvm_gmem_vm_ops = {
.fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_USERFAULTFD
+ .get_folio_noalloc = kvm_gmem_get_folio_noalloc,
+#endif
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
--
2.51.0
On 30/11/2025 11:18, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> userfaultfd notifications about minor page faults used for live migration
> and snapshotting of VMs with memory backed by shared hugetlbfs or tmpfs
> mappings as described in detail in commit 7677f7fd8be7 ("userfaultfd: add
> minor fault registration mode").
>
> To use the same mechanism for VMs that use guest_memfd to map their memory,
> guest_memfd should support userfaultfd minor mode.
>
> Extend ->fault() method of guest_memfd with ability to notify core page
> fault handler that a page fault requires handle_userfault(VM_UFFD_MINOR) to
> complete and add implementation of ->get_folio_noalloc() to guest_memfd
> vm_ops.
>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
> virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++-
> 1 file changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index ffadc5ee8e04..dca6e373937b 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
> +#include <linux/userfaultfd_k.h>
>
> #include "kvm_mm.h"
>
> @@ -359,7 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
> return VM_FAULT_SIGBUS;
>
> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> + folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
> + if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
> + ret = VM_FAULT_UFFD_MINOR;
> + goto out_folio;
> + }
I realised that I might have been wrong in [1] saying that the noalloc
get folio was ok for our use case. Unfortunately we rely on a minor
fault to get generated even when the page is being allocated. Peter and
I discussed it originally in [2]. Since we want to populate guest
memory with the content supplied by userspace on demand, we have to be
able to intercept the very first access, meaning we either need a minor
or major UFFD event for that. We decided to make use of the minor at
the time. If we have to preserve the shmem semantics, it forces us to
implement support for major/UFFDIO_COPY.
[1]
https://lore.kernel.org/all/4405c306-9d7c-4fd6-9ea6-2ed1b73f5c2e@amazon.com
[2] https://lore.kernel.org/kvm/Z9HhTjEWtM58Zfxf@x1.local
> +
> + if (PTR_ERR(folio) == -ENOENT)
> + folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +
> if (IS_ERR(folio)) {
> int err = PTR_ERR(folio);
>
> @@ -390,8 +399,30 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> return ret;
> }
>
> +#ifdef CONFIG_USERFAULTFD
> +static struct folio *kvm_gmem_get_folio_noalloc(struct inode *inode,
> + pgoff_t pgoff)
> +{
> + struct folio *folio;
> +
> + folio = filemap_lock_folio(inode->i_mapping, pgoff);
> + if (IS_ERR_OR_NULL(folio))
> + return folio;
> +
> + if (!folio_test_uptodate(folio)) {
> + clear_highpage(folio_page(folio, 0));
> + kvm_gmem_mark_prepared(folio);
> + }
> +
> + return folio;
> +}
> +#endif
> +
> static const struct vm_operations_struct kvm_gmem_vm_ops = {
> .fault = kvm_gmem_fault_user_mapping,
> +#ifdef CONFIG_USERFAULTFD
> + .get_folio_noalloc = kvm_gmem_get_folio_noalloc,
> +#endif
> };
>
> static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> --
> 2.51.0
>
On 12/1/25 14:39, Nikita Kalyazin wrote:
>
>
> On 30/11/2025 11:18, Mike Rapoport wrote:
>> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>>
>> userfaultfd notifications about minor page faults used for live migration
>> and snapshotting of VMs with memory backed by shared hugetlbfs or tmpfs
>> mappings as described in detail in commit 7677f7fd8be7 ("userfaultfd: add
>> minor fault registration mode").
>>
>> To use the same mechanism for VMs that use guest_memfd to map their memory,
>> guest_memfd should support userfaultfd minor mode.
>>
>> Extend ->fault() method of guest_memfd with ability to notify core page
>> fault handler that a page fault requires handle_userfault(VM_UFFD_MINOR) to
>> complete and add implementation of ->get_folio_noalloc() to guest_memfd
>> vm_ops.
>>
>> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
>> ---
>> virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++-
>> 1 file changed, 32 insertions(+), 1 deletion(-)
>>
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index ffadc5ee8e04..dca6e373937b 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -4,6 +4,7 @@
>> #include <linux/kvm_host.h>
>> #include <linux/pagemap.h>
>> #include <linux/anon_inodes.h>
>> +#include <linux/userfaultfd_k.h>
>>
>> #include "kvm_mm.h"
>>
>> @@ -359,7 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>> if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
>> return VM_FAULT_SIGBUS;
>>
>> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> + folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
>> + if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
>> + ret = VM_FAULT_UFFD_MINOR;
>> + goto out_folio;
>> + }
>
> I realised that I might have been wrong in [1] saying that the noalloc
> get folio was ok for our use case. Unfortunately we rely on a minor
> fault to get generated even when the page is being allocated. Peter and
> I discussed it originally in [2]. Since we want to populate guest
> memory with the content supplied by userspace on demand, we have to be
> able to intercept the very first access, meaning we either need a minor
> or major UFFD event for that. We decided to make use of the minor at
> the time. If we have to preserve the shmem semantics, it forces us to
> implement support for major/UFFDIO_COPY.
If we want missing semantics then likely we should be adding ... missing
support? :)
--
Cheers
David
On 01/12/2025 15:54, David Hildenbrand (Red Hat) wrote:
> On 12/1/25 14:39, Nikita Kalyazin wrote:
>>
>>
>> On 30/11/2025 11:18, Mike Rapoport wrote:
>>> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>>>
>>> userfaultfd notifications about minor page faults used for live
>>> migration
>>> and snapshotting of VMs with memory backed by shared hugetlbfs or tmpfs
>>> mappings as described in detail in commit 7677f7fd8be7 ("userfaultfd:
>>> add
>>> minor fault registration mode").
>>>
>>> To use the same mechanism for VMs that use guest_memfd to map their
>>> memory,
>>> guest_memfd should support userfaultfd minor mode.
>>>
>>> Extend ->fault() method of guest_memfd with ability to notify core page
>>> fault handler that a page fault requires
>>> handle_userfault(VM_UFFD_MINOR) to
>>> complete and add implementation of ->get_folio_noalloc() to guest_memfd
>>> vm_ops.
>>>
>>> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
>>> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
>>> ---
>>> virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++-
>>> 1 file changed, 32 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>>> index ffadc5ee8e04..dca6e373937b 100644
>>> --- a/virt/kvm/guest_memfd.c
>>> +++ b/virt/kvm/guest_memfd.c
>>> @@ -4,6 +4,7 @@
>>> #include <linux/kvm_host.h>
>>> #include <linux/pagemap.h>
>>> #include <linux/anon_inodes.h>
>>> +#include <linux/userfaultfd_k.h>
>>>
>>> #include "kvm_mm.h"
>>>
>>> @@ -359,7 +360,15 @@ static vm_fault_t
>>> kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
>>> if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
>>> return VM_FAULT_SIGBUS;
>>>
>>> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>>> + folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
>>> + if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
>>> + ret = VM_FAULT_UFFD_MINOR;
>>> + goto out_folio;
>>> + }
>>
>> I realised that I might have been wrong in [1] saying that the noalloc
>> get folio was ok for our use case. Unfortunately we rely on a minor
>> fault to get generated even when the page is being allocated. Peter and
>> I discussed it originally in [2]. Since we want to populate guest
>> memory with the content supplied by userspace on demand, we have to be
>> able to intercept the very first access, meaning we either need a minor
>> or major UFFD event for that. We decided to make use of the minor at
>> the time. If we have to preserve the shmem semantics, it forces us to
>> implement support for major/UFFDIO_COPY.
>
> If we want missing semantics then likely we should be adding ... missing
> support? :)
I believe I found the precise point where we convinced ourselves that
minor support was sufficient: [1]. If at this moment we don't find that
reasoning valid anymore, then indeed implementing missing is the only
option.
[1] https://lore.kernel.org/kvm/Z9GsIDVYWoV8d8-C@x1.local
>
> --
> Cheers
>
> David
On Mon, Dec 01, 2025 at 04:48:22PM +0000, Nikita Kalyazin wrote:
> I believe I found the precise point where we convinced ourselves that minor
> support was sufficient: [1]. If at this moment we don't find that reasoning
> valid anymore, then indeed implementing missing is the only option.
>
> [1] https://lore.kernel.org/kvm/Z9GsIDVYWoV8d8-C@x1.local

Now after I re-read the discussion, I may have made a wrong statement
there, sorry. I could have got slightly confused on when the write()
syscall can be involved.

I agree if you want to get an event when cache missed with the current uffd
definitions and when pre-population is forbidden, then MISSING trap is
required. That is, with/without the need of UFFDIO_COPY being available.

Do I understand it right that UFFDIO_COPY is not allowed in your case, but
only write()?

One way that might work this around, is introducing a new UFFD_FEATURE bit
allowing the MINOR registration to trap all pgtable faults, which will
change the MINOR fault semantics.

That'll need some further thoughts, meanwhile we may also want to make sure
the old shmem/hugetlbfs semantics are kept (e.g. they should fail MINOR
registers if the new feature bit is enabled in the ctx somehow; or support
them properly in the codebase).

Thanks,

--
Peter Xu
On 01/12/2025 18:35, Peter Xu wrote:
> On Mon, Dec 01, 2025 at 04:48:22PM +0000, Nikita Kalyazin wrote:
>> I believe I found the precise point where we convinced ourselves that minor
>> support was sufficient: [1]. If at this moment we don't find that reasoning
>> valid anymore, then indeed implementing missing is the only option.
>>
>> [1] https://lore.kernel.org/kvm/Z9GsIDVYWoV8d8-C@x1.local
>
> Now after I re-read the discussion, I may have made a wrong statement
> there, sorry. I could have got slightly confused on when the write()
> syscall can be involved.
>
> I agree if you want to get an event when cache missed with the current uffd
> definitions and when pre-population is forbidden, then MISSING trap is
> required. That is, with/without the need of UFFDIO_COPY being available.
>
> Do I understand it right that UFFDIO_COPY is not allowed in your case, but
> only write()?

No, UFFDIO_COPY would work perfectly fine. We will still use write()
whenever we resolve stage-2 faults as they aren't visible to UFFD. When a
userfault occurs at an offset that already has a page in the cache, we will
have to keep using UFFDIO_CONTINUE so it looks like both will be required:

- user mapping major fault -> UFFDIO_COPY (fills the cache and sets up
  userspace PT)
- user mapping minor fault -> UFFDIO_CONTINUE (only sets up userspace PT)
- stage-2 fault -> write() (only fills the cache)

>
> One way that might work this around, is introducing a new UFFD_FEATURE bit
> allowing the MINOR registration to trap all pgtable faults, which will
> change the MINOR fault semantics.

This would equally work for us. I suppose this MINOR+MAJOR semantics would
be more intrusive from the API point of view though.

>
> That'll need some further thoughts, meanwhile we may also want to make sure
> the old shmem/hugetlbfs semantics are kept (e.g. they should fail MINOR
> registers if the new feature bit is enabled in the ctx somehow; or support
> them properly in the codebase).
>
> Thanks,
>
> --
> Peter Xu
>
On Mon, Dec 01, 2025 at 08:12:38PM +0000, Nikita Kalyazin wrote:
>
>
> On 01/12/2025 18:35, Peter Xu wrote:
> > On Mon, Dec 01, 2025 at 04:48:22PM +0000, Nikita Kalyazin wrote:
> > > I believe I found the precise point where we convinced ourselves that minor
> > > support was sufficient: [1]. If at this moment we don't find that reasoning
> > > valid anymore, then indeed implementing missing is the only option.
> > >
> > > [1] https://lore.kernel.org/kvm/Z9GsIDVYWoV8d8-C@x1.local
> >
> > Now after I re-read the discussion, I may have made a wrong statement
> > there, sorry. I could have got slightly confused on when the write()
> > syscall can be involved.
> >
> > I agree if you want to get an event when cache missed with the current uffd
> > definitions and when pre-population is forbidden, then MISSING trap is
> > required. That is, with/without the need of UFFDIO_COPY being available.
> >
> > Do I understand it right that UFFDIO_COPY is not allowed in your case, but
> > only write()?
>
> No, UFFDIO_COPY would work perfectly fine. We will still use write()
> whenever we resolve stage-2 faults as they aren't visible to UFFD. When a
> userfault occurs at an offset that already has a page in the cache, we will
> have to keep using UFFDIO_CONTINUE so it looks like both will be required:
>
> - user mapping major fault -> UFFDIO_COPY (fills the cache and sets up
> userspace PT)
> - user mapping minor fault -> UFFDIO_CONTINUE (only sets up userspace PT)
> - stage-2 fault -> write() (only fills the cache)

Is stage-2 fault about KVM_MEMORY_EXIT_FLAG_USERFAULT, per James's series?

It looks fine indeed, but it looks slightly weird then, as you'll have two
ways to populate the page cache. Logically here atomicity is indeed not
needed when you trap both MISSING + MINOR.

>
> >
> > One way that might work this around, is introducing a new UFFD_FEATURE bit
> > allowing the MINOR registration to trap all pgtable faults, which will
> > change the MINOR fault semantics.
>
> This would equally work for us. I suppose this MINOR+MAJOR semantics would
> be more intrusive from the API point of view though.

Yes it is, it's just that I don't know whether it'll be harder when you
want to completely support UFFDIO_COPY here, per previous discussions.

After a 2nd thought, such UFFD_FEATURE is probably not a good design,
because it essentially means that feature bit will functionally overlap
with what MISSING trap was trying to do, however duplicating that concept
in a VMA that was registered as MINOR only.

Maybe it's possible instead if we allow a module to support MISSING trap,
but without supporting UFFDIO_COPY ioctl. That is, the MISSING events will
be properly generated if MISSING traps are supported, however the module
needs to provide its own way to resolve it if UFFDIO_COPY ioctl isn't
available. Gmem is fine in this case as long as it'll always be registered
with both MISSING+MINOR traps, then resolving using write()s would work.

Such would be possible when with something like my v3 previously:

https://lore.kernel.org/all/20250926211650.525109-1-peterx@redhat.com/#t

Then gmem needs to declare VM_UFFD_MISSING + VM_UFFD_MINOR in
uffd_features, but _UFFDIO_CONTINUE only (without _UFFDIO_COPY) in
uffd_ioctls.

Since Mike already took this series over, I'll leave that to you all to
decide.

--
Peter Xu
On 11/30/25 12:18, Mike Rapoport wrote:
> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> userfaultfd notifications about minor page faults used for live migration
> and snapshotting of VMs with memory backed by shared hugetlbfs or tmpfs
> mappings as described in detail in commit 7677f7fd8be7 ("userfaultfd: add
> minor fault registration mode").
>
> To use the same mechanism for VMs that use guest_memfd to map their memory,
> guest_memfd should support userfaultfd minor mode.
>
> Extend ->fault() method of guest_memfd with ability to notify core page
> fault handler that a page fault requires handle_userfault(VM_UFFD_MINOR) to
> complete and add implementation of ->get_folio_noalloc() to guest_memfd
> vm_ops.
>
> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
> virt/kvm/guest_memfd.c | 33 ++++++++++++++++++++++++++++++++-
> 1 file changed, 32 insertions(+), 1 deletion(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index ffadc5ee8e04..dca6e373937b 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/kvm_host.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
> +#include <linux/userfaultfd_k.h>
>
> #include "kvm_mm.h"
>
> @@ -359,7 +360,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
> return VM_FAULT_SIGBUS;
>
> - folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> + folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
> + if (!IS_ERR_OR_NULL(folio) && userfaultfd_minor(vmf->vma)) {
Can we ever get NULL here?
> + ret = VM_FAULT_UFFD_MINOR;
> + goto out_folio;
> + }
> +
> + if (PTR_ERR(folio) == -ENOENT)
> + folio = kvm_gmem_get_folio(inode, vmf->pgoff);
Was briefly wondering what the performance impact of that two-step
approach is (two lookups in case we have to create it IIUC)
Wouldn't it be better to limit it to the userfaultfd_minor(vmf->vma) case?
if (userfaultfd_minor(vmf->vma)) {
folio = filemap_lock_folio(inode->i_mapping, vmf->pgoff);
if (!IS_ERR(folio)) {
ret = VM_FAULT_UFFD_MINOR;
goto out_folio;
}
} else {
folio = kvm_gmem_get_folio(inode, vmf->pgoff);
}
if (IS_ERR(folio)) {
...
--
Cheers
David
© 2016 - 2025 Red Hat, Inc.