From: Shivank Garg <shivankg@amd.com>
Previously, guest-memfd allocations followed the local NUMA node in the
absence of a process mempolicy, resulting in arbitrary placement of guest
memory.
Moreover, mbind() couldn't be used by the VMM as guest memory wasn't
mapped into userspace when allocation occurred.
Enable NUMA policy support by implementing vm_ops for the guest-memfd mmap
operation. This allows the VMM to map the memory and use mbind() to set the
desired NUMA policy. The policy is stored in struct gmem_inode, as memory
policy is a property of the memory (struct inode) itself. The policy is then
retrieved via mpol_shared_policy_lookup() and passed to
__filemap_get_folio_mpol() to ensure that allocations follow the specified
memory policy.
This enables the VMM to control guest memory NUMA placement by calling
mbind() on the mapped memory regions, providing fine-grained control over
guest memory allocation across NUMA nodes.
The policy change only affects future allocations and does not migrate
existing memory. This matches mbind(2)'s default behavior, which affects
only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
flags, which are not supported for guest_memfd as its memory is unmovable.
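For illustration, the VMM-side flow could look roughly like this (a
minimal sketch, not part of this patch: the fd is assumed to come from
KVM_CREATE_GUEST_MEMFD, the helper name is made up, and error handling
is trimmed; link with -lnuma):

  #include <numaif.h>
  #include <sys/mman.h>

  /* Bind future guest_memfd allocations in [0, size) to NUMA node 0. */
  static void *bind_gmem_to_node0(int gmem_fd, size_t size)
  {
          unsigned long nodemask = 1UL << 0;      /* node 0 only */
          void *mem;

          mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                     gmem_fd, 0);
          if (mem == MAP_FAILED)
                  return NULL;

          /* Affects only folios allocated after this point. */
          if (mbind(mem, size, MPOL_BIND, &nodemask,
                    sizeof(nodemask) * 8, 0)) {
                  munmap(mem, size);
                  return NULL;
          }
          return mem;
  }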
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Shivank Garg <shivankg@amd.com>
Tested-by: Ashish Kalra <ashish.kalra@amd.com>
[sean: document the ABI impact of the ->get_policy() hook]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 67 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index cc3b25155726..95267c92983b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -27,6 +28,7 @@ struct gmem_file {
};
struct gmem_inode {
+ struct shared_policy policy;
struct inode vfs_inode;
};
@@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
+static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
+ pgoff_t index)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol;
+
+ mpol = mpol_shared_policy_lookup(&gi->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+#else
+ return NULL;
+#endif
+}
+
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
@@ -124,7 +139,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+ * Fast-path: See if folio is already present in mapping to avoid
+ * policy_lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
+ folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping), policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
@@ -413,8 +446,38 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
return ret;
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+ /*
+ * Note! Directly return whatever the lookup returns, do NOT return
+ * the current task's policy as is done when looking up the policy for
+ * a specific folio. Kernel ABI for get_mempolicy() is to return
+ * MPOL_DEFAULT when there is no defined policy, not whatever the
+ * default policy resolves to.
+ */
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+}
+#endif /* CONFIG_NUMA */
+
static const struct vm_operations_struct kvm_gmem_vm_ops = {
- .fault = kvm_gmem_fault_user_mapping,
+ .fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
@@ -867,11 +930,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
if (!gi)
return NULL;
+ mpol_shared_policy_init(&gi->policy, NULL);
return &gi->vfs_inode;
}
static void kvm_gmem_destroy_inode(struct inode *inode)
{
+ mpol_free_shared_policy(&GMEM_I(inode)->policy);
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.51.0.710.ga91ca5db03-goog
Sean Christopherson <seanjc@google.com> writes:
> From: Shivank Garg <shivankg@amd.com>
>
> Previously, guest-memfd allocations followed the local NUMA node in the
> absence of a process mempolicy, resulting in arbitrary placement of guest
> memory.
> Moreover, mbind() couldn't be used by the VMM as guest memory wasn't
> mapped into userspace when allocation occurred.
>
> Enable NUMA policy support by implementing vm_ops for the guest-memfd mmap
> operation. This allows the VMM to map the memory and use mbind() to set the
> desired NUMA policy. The policy is stored in struct gmem_inode, as memory
> policy is a property of the memory (struct inode) itself. The policy is then
> retrieved via mpol_shared_policy_lookup() and passed to
> __filemap_get_folio_mpol() to ensure that allocations follow the specified
> memory policy.
>
> This enables the VMM to control guest memory NUMA placement by calling
> mbind() on the mapped memory regions, providing fine-grained control over
> guest memory allocation across NUMA nodes.
>
> The policy change only affects future allocations and does not migrate
> existing memory. This matches mbind(2)'s default behavior, which affects
> only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
> flags, which are not supported for guest_memfd as its memory is unmovable.
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> Tested-by: Ashish Kalra <ashish.kalra@amd.com>
> [sean: document the ABI impact of the ->get_policy() hook]
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
> virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 67 insertions(+), 2 deletions(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index cc3b25155726..95267c92983b 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
> #include <linux/falloc.h>
> #include <linux/fs.h>
> #include <linux/kvm_host.h>
> +#include <linux/mempolicy.h>
> #include <linux/pseudo_fs.h>
> #include <linux/pagemap.h>
>
> @@ -27,6 +28,7 @@ struct gmem_file {
> };
>
> struct gmem_inode {
> + struct shared_policy policy;
> struct inode vfs_inode;
> };
>
> @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> return r;
> }
>
> +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
> + pgoff_t index)
How about kvm_gmem_get_index_policy() instead, since the policy is keyed
by index?
> +{
> +#ifdef CONFIG_NUMA
> + struct mempolicy *mpol;
> +
> + mpol = mpol_shared_policy_lookup(&gi->policy, index);
> + return mpol ? mpol : get_task_policy(current);
Should we be returning NULL if no shared policy was defined?
By returning NULL, __filemap_get_folio_mpol() can handle the case where
cpuset_do_page_mem_spread() returns true.
If we always return current's task policy, what if the user wants to use
cpuset_do_page_mem_spread()?
> +#else
> + return NULL;
Returning current's task policy in the CONFIG_NUMA case seems to
conflict with returning NULL here.
> +#endif
> +}
> +
> /*
> * Returns a locked folio on success. The caller is responsible for
> * setting the up-to-date flag before the memory is mapped into the guest.
> @@ -124,7 +139,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> {
> /* TODO: Support huge pages. */
> - return filemap_grab_folio(inode->i_mapping, index);
> + struct mempolicy *policy;
> + struct folio *folio;
> +
> + /*
> + * Fast-path: See if folio is already present in mapping to avoid
> + * policy_lookup.
> + */
> + folio = __filemap_get_folio(inode->i_mapping, index,
> + FGP_LOCK | FGP_ACCESSED, 0);
> + if (!IS_ERR(folio))
> + return folio;
> +
> + policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
If we're going to return NULL if no shared policy is defined, then I
believe we can directly call mpol_shared_policy_lookup() here.
> + folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> + FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
> + mapping_gfp_mask(inode->i_mapping), policy);
> + mpol_cond_put(policy);
> +
> + return folio;
> }
>
> static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
> @@ -413,8 +446,38 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> return ret;
> }
>
> +#ifdef CONFIG_NUMA
> +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
> +{
> + struct inode *inode = file_inode(vma->vm_file);
> +
> + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> + unsigned long addr, pgoff_t *pgoff)
> +{
> + struct inode *inode = file_inode(vma->vm_file);
> +
> + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> +
> + /*
> + * Note! Directly return whatever the lookup returns, do NOT return
> + * the current task's policy as is done when looking up the policy for
> + * a specific folio. Kernel ABI for get_mempolicy() is to return
> + * MPOL_DEFAULT when there is no defined policy, not whatever the
> + * default policy resolves to.
To be more accurate, I think this sentence should be:
Kernel ABI for .get_policy is to return NULL if no policy is specified
at this index. The caller can then replace NULL with the default memory
policy instead of current's memory policy.
> + */
> + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
> +}
> +#endif /* CONFIG_NUMA */
> +
> static const struct vm_operations_struct kvm_gmem_vm_ops = {
> - .fault = kvm_gmem_fault_user_mapping,
> + .fault = kvm_gmem_fault_user_mapping,
> +#ifdef CONFIG_NUMA
> + .get_policy = kvm_gmem_get_policy,
> + .set_policy = kvm_gmem_set_policy,
> +#endif
> };
>
> static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> @@ -867,11 +930,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
> if (!gi)
> return NULL;
>
> + mpol_shared_policy_init(&gi->policy, NULL);
> return &gi->vfs_inode;
> }
>
> static void kvm_gmem_destroy_inode(struct inode *inode)
> {
> + mpol_free_shared_policy(&GMEM_I(inode)->policy);
> }
>
> static void kvm_gmem_free_inode(struct inode *inode)
> --
> 2.51.0.710.ga91ca5db03-goog
>>
>> @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>> return r;
>> }
>>
>> +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
>> + pgoff_t index)
>
> How about kvm_gmem_get_index_policy() instead, since the policy is keyed
> by index?
>
>> +{
>> +#ifdef CONFIG_NUMA
>> + struct mempolicy *mpol;
>> +
>> + mpol = mpol_shared_policy_lookup(&gi->policy, index);
>> + return mpol ? mpol : get_task_policy(current);
>
> Should we be returning NULL if no shared policy was defined?
>
> By returning NULL, __filemap_get_folio_mpol() can handle the case where
> cpuset_do_page_mem_spread() returns true.
>
> If we always return current's task policy, what if the user wants to use
> cpuset_do_page_mem_spread()?
>
I initially followed shmem's approach here.
I agree that returning NULL maintains consistency with the current default
behavior of cpuset_do_page_mem_spread(), regardless of CONFIG_NUMA.
I'm curious what could be the practical implications of cpuset_do_page_mem_spread()
v/s get_task_policy() as the fallback?
Which is more appropriate for guest_memfd when no policy is explicitly set via mbind()?
On Fri, Oct 10, 2025, Shivank Garg wrote:
> >> @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >> return r;
> >> }
> >>
> >> +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
> >> + pgoff_t index)
> >
> > How about kvm_gmem_get_index_policy() instead, since the policy is keyed
> > by index?
But isn't the policy tied to the folio? I assume/hope that something will split
folios if they have different policies for their indices when a folio contains
more than one page. In other words, how will this work when hugepage support
comes along?
So yeah, I agree that the lookup is keyed on the index, but conceptually aren't
we getting the policy for the folio? The index is a means to an end.
> >> +{
> >> +#ifdef CONFIG_NUMA
> >> + struct mempolicy *mpol;
> >> +
> >> + mpol = mpol_shared_policy_lookup(&gi->policy, index);
> >> + return mpol ? mpol : get_task_policy(current);
> >
> > Should we be returning NULL if no shared policy was defined?
> >
> > By returning NULL, __filemap_get_folio_mpol() can handle the case where
> > cpuset_do_page_mem_spread() returns true.
> >
> > If we always return current's task policy, what if the user wants to use
> > cpuset_do_page_mem_spread()?
> >
>
> I initially followed shmem's approach here.
> I agree that returning NULL maintains consistency with the current default
> behavior of cpuset_do_page_mem_spread(), regardless of CONFIG_NUMA.
>
> I'm curious what could be the practical implications of cpuset_do_page_mem_spread()
> v/s get_task_policy() as the fallback?
Userspace could enable page spreading on the task that triggers guest_memfd
allocation. I can't conjure up a reason to do that, but I've been surprised
more than once by KVM setups.
> Which is more appropriate for guest_memfd when no policy is explicitly set
> via mbind()?
I don't think we need to answer that question? Userspace _has_ set a policy,
just through cpuset, not via mbind(). So while I can't imagine there's a sane
use case for cpuset_do_page_mem_spread() with guest_memfd, I also don't see a
reason why KVM should effectively disallow it.
And unless I'm missing something, allocation will eventually fall back to
get_task_policy() (in alloc_frozen_pages_noprof()), so by explicitly getting the
task policy in guest_memfd, KVM is doing _more_ work than necessary _and_ is
unnecessarily restricting userspace.
Add in that returning NULL would align this code with the ->get_policy hook (and
could be shared again, I assume), and my vote is definitely to return NULL and
not get in the way.
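For reference, a simplified sketch of the cpuset page-spreading fallback
referred to above, paraphrased from mm/filemap.c's filemap_alloc_folio()
(exact names and structure vary by kernel version):

  struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
  {
          /*
           * With no explicit mempolicy passed down, cpuset page
           * spreading can pick the node; otherwise the allocator
           * falls back to the task policy on its own.
           */
          if (cpuset_do_page_mem_spread()) {
                  int n = cpuset_mem_spread_node();

                  return __folio_alloc_node(gfp, order, n);
          }
          return folio_alloc(gfp, order);
  }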
Sean Christopherson <seanjc@google.com> writes:
> On Fri, Oct 10, 2025, Shivank Garg wrote:
>> >> @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>> >> return r;
>> >> }
>> >>
>> >> +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
>> >> + pgoff_t index)
>> >
>> > How about kvm_gmem_get_index_policy() instead, since the policy is keyed
>> > by index?
>
> But isn't the policy tied to the folio? I assume/hope that something will split
> folios if they have different policies for their indices when a folio contains
> more than one page. In other words, how will this work when hugepage support
> comes along?
>
> So yeah, I agree that the lookup is keyed on the index, but conceptually aren't
> we getting the policy for the folio? The index is a means to an end.
>
I think the policy is tied to the index.
When we mmap(), there may not be a folio at this index yet, so any folio
that gets allocated for this index then is taken from the right NUMA
node based on the policy.
If the folio is later truncated, the folio just goes back to the NUMA
node, but the memory policy remains for the next folio to be allocated
at this index.
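A userspace analogue with shmem, whose shared-policy behavior
guest_memfd mirrors here, illustrates that lifecycle (a minimal sketch
under that assumption, error handling trimmed; link with -lnuma):

  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <numaif.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          size_t len = 2UL << 20;
          unsigned long nodemask = 1UL << 0;
          int fd = memfd_create("demo", 0);
          char *p;

          ftruncate(fd, len);
          p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
          mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0);

          memset(p, 1, len);      /* folios allocated on node 0 */
          fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, len);
          memset(p, 1, len);      /* new folios, still node 0 */
          return 0;
  }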
>> >> +{
>> >> +#ifdef CONFIG_NUMA
>> >> + struct mempolicy *mpol;
>> >> +
>> >> + mpol = mpol_shared_policy_lookup(&gi->policy, index);
>> >> + return mpol ? mpol : get_task_policy(current);
>> >
>> > Should we be returning NULL if no shared policy was defined?
>> >
>> > By returning NULL, __filemap_get_folio_mpol() can handle the case where
>> > cpuset_do_page_mem_spread() returns true.
>> >
>> > If we always return current's task policy, what if the user wants to use
>> > cpuset_do_page_mem_spread()?
>> >
>>
>> I initially followed shmem's approach here.
>> I agree that returning NULL maintains consistency with the current default
>> behavior of cpuset_do_page_mem_spread(), regardless of CONFIG_NUMA.
>>
>> I'm curious what could be the practical implications of cpuset_do_page_mem_spread()
>> v/s get_task_policy() as the fallback?
>
> Userspace could enable page spreading on the task that triggers guest_memfd
> allocation. I can't conjure up a reason to do that, but I've been surprised
> more than once by KVM setups.
>
>> Which is more appropriate for guest_memfd when no policy is explicitly set
>> via mbind()?
>
> I don't think we need to answer that question? Userspace _has_ set a policy,
> just through cpuset, not via mbind(). So while I can't imagine there's a sane
> use case for cpuset_do_page_mem_spread() with guest_memfd, I also don't see a
> reason why KVM should effectively disallow it.
>
> And unless I'm missing something, allocation will eventually fall back to
> get_task_policy() (in alloc_frozen_pages_noprof()), so by explicitly getting the
> task policy in guest_memfd, KVM is doing _more_ work than necessary _and_ is
> unnecessarily restricting userspace.
>
> Add in that returning NULL would align this code with the ->get_policy hook (and
> could be shared again, I assume), and my vote is definitely to return NULL and
> not get in the way.
... although if we are going to return NULL then we can directly use
mpol_shared_policy_lookup(), so the first discussion is moot.
Though looking slightly into the future, shareability (aka memory
attributes, or shared/private state within guest_memfd inodes) is also
keyed by index, and is a property of the index and not the folio (since
shared/private state is defined even before folios are allocated for a
given index).
On Fri, Oct 10, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
>
> > On Fri, Oct 10, 2025, Shivank Garg wrote:
> >> >> @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >> >> return r;
> >> >> }
> >> >>
> >> >> +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
> >> >> + pgoff_t index)
> >> >
> >> > How about kvm_gmem_get_index_policy() instead, since the policy is keyed
> >> > by index?
> >
> > But isn't the policy tied to the folio? I assume/hope that something will split
> > folios if they have different policies for their indices when a folio contains
> > more than one page. In other words, how will this work when hugepage support
> > comes along?
> >
> > So yeah, I agree that the lookup is keyed on the index, but conceptually aren't
> > we getting the policy for the folio? The index is a means to an end.
> >
>
> I think the policy is tied to the index.
>
> When we mmap(), there may not be a folio at this index yet, so any folio
> that gets allocated for this index then is taken from the right NUMA
> node based on the policy.
>
> If the folio is later truncated, the folio just goes back to the NUMA
> node, but the memory policy remains for the next folio to be allocated
> at this index.
Right. Though thinking about this more, there's no reason to have "index" in
the name; kvm_gmem_get_policy() is sufficient. E.g. we don't have "index" in
the name for things like kvm_get_vcpu().
Luckily, it's all made moot by Shivank's fixup :-)
> >> >> +{
> >> >> +#ifdef CONFIG_NUMA
> >> >> + struct mempolicy *mpol;
> >> >> +
> >> >> + mpol = mpol_shared_policy_lookup(&gi->policy, index);
> >> >> + return mpol ? mpol : get_task_policy(current);
> >> >
> >> > Should we be returning NULL if no shared policy was defined?
> >> >
> >> > By returning NULL, __filemap_get_folio_mpol() can handle the case where
> >> > cpuset_do_page_mem_spread() returns true.
> >> >
> >> > If we always return current's task policy, what if the user wants to use
> >> > cpuset_do_page_mem_spread()?
> >> >
> >>
> >> I initially followed shmem's approach here.
> >> I agree that returning NULL maintains consistency with the current default
> >> behavior of cpuset_do_page_mem_spread(), regardless of CONFIG_NUMA.
> >>
> >> I'm curious what could be the practical implications of cpuset_do_page_mem_spread()
> >> v/s get_task_policy() as the fallback?
> >
> > Userspace could enable page spreading on the task that triggers guest_memfd
> > allocation. I can't conjure up a reason to do that, but I've been surprised
> > more than once by KVM setups.
> >
> >> Which is more appropriate for guest_memfd when no policy is explicitly set
> >> via mbind()?
> >
> > I don't think we need to answer that question? Userspace _has_ set a policy,
> > just through cpuset, not via mbind(). So while I can't imagine there's a sane
> > use case for cpuset_do_page_mem_spread() with guest_memfd, I also don't see a
> > reason why KVM should effectively disallow it.
> >
> > And unless I'm missing something, allocation will eventually fall back to
> > get_task_policy() (in alloc_frozen_pages_noprof()), so by explicitly getting the
> > task policy in guest_memfd, KVM is doing _more_ work than necessary _and_ is
> > unnecessarily restricting userspace.
> >
> > Add in that returning NULL would align this code with the ->get_policy hook (and
> > could be shared again, I assume), and my vote is definitely to return NULL and
> > not get in the way.
>
> ... although if we are going to return NULL then we can directly use
> mpol_shared_policy_lookup(), so the first discussion is moot.
Ha! Great minds think alike, right!!?!
> Though looking slightly into the future, shareability (aka memory
> attributes, or shared/private state within guest_memfd inodes) is also
> keyed by index, and is a property of the index and not the folio (since
> shared/private state is defined even before folios are allocated for a
> given index).
Yeah, which further reinforces that having "index" in the function name is
superfluous (and potentially confusing), e.g. IMO the proposed helpers:
kvm_gmem_get_attributes()
kvm_gmem_is_private_mem()
kvm_gmem_is_shared_mem()
are far better than e.g.:
kvm_gmem_get_index_attributes()
kvm_gmem_is_index_private_mem()
kvm_gmem_is_index_shared_mem()
On 10/11/2025 3:27 AM, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
>
>> On Fri, Oct 10, 2025, Shivank Garg wrote:
>>>>> @@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>>>>> return r;
>>>>> }
>>>>>
>>>>> +static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
>>>>> + pgoff_t index)
>>>>
>>>> How about kvm_gmem_get_index_policy() instead, since the policy is keyed
>>>> by index?
>>
>> But isn't the policy tied to the folio? I assume/hope that something will split
>> folios if they have different policies for their indices when a folio contains
>> more than one page. In other words, how will this work when hugepage support
>> comes along?
>>
>> So yeah, I agree that the lookup is keyed on the index, but conceptually aren't
>> we getting the policy for the folio? The index is a means to an end.
>>
>
> I think the policy is tied to the index.
>
> When we mmap(), there may not be a folio at this index yet, so any folio
> that gets allocated for this index then is taken from the right NUMA
> node based on the policy.
>
> If the folio is later truncated, the folio just goes back to the NUMA
> node, but the memory policy remains for the next folio to be allocated
> at this index.
>
>>>>> +{
>>>>> +#ifdef CONFIG_NUMA
>>>>> + struct mempolicy *mpol;
>>>>> +
>>>>> + mpol = mpol_shared_policy_lookup(&gi->policy, index);
>>>>> + return mpol ? mpol : get_task_policy(current);
>>>>
>>>> Should we be returning NULL if no shared policy was defined?
>>>>
>>>> By returning NULL, __filemap_get_folio_mpol() can handle the case where
>>>> cpuset_do_page_mem_spread() returns true.
>>>>
>>>> If we always return current's task policy, what if the user wants to use
>>>> cpuset_do_page_mem_spread()?
>>>>
>>>
>>> I initially followed shmem's approach here.
>>> I agree that returning NULL maintains consistency with the current default
>>> behavior of cpuset_do_page_mem_spread(), regardless of CONFIG_NUMA.
>>>
>>> I'm curious what could be the practical implications of cpuset_do_page_mem_spread()
>>> v/s get_task_policy() as the fallback?
>>
>> Userspace could enable page spreading on the task that triggers guest_memfd
>> allocation. I can't conjure up a reason to do that, but I've been surprised
>> more than once by KVM setups.
>>
>>> Which is more appropriate for guest_memfd when no policy is explicitly set
>>> via mbind()?
>>
>> I don't think we need to answer that question? Userspace _has_ set a policy,
>> just through cpuset, not via mbind(). So while I can't imagine there's a sane
>> use case for cpuset_do_page_mem_spread() with guest_memfd, I also don't see a
>> reason why KVM should effectively disallow it.
>>
>> And unless I'm missing something, allocation will eventually fall back to
>> get_task_policy() (in alloc_frozen_pages_noprof()), so by explicitly getting the
>> task policy in guest_memfd, KVM is doing _more_ work than necessary _and_ is
>> unnecessarily restricting userspace.
>>
>> Add in that returning NULL would align this code with the ->get_policy hook (and
>> could be shared again, I assume), and my vote is definitely to return NULL and
>> not get in the way.
>
> ... although if we are going to return NULL then we can directly use
> mpol_shared_policy_lookup(), so the first discussion is moot.
>
>
> Though looking slightly into the future, shareability (aka memory
> attributes, or shared/private state within guest_memfd inodes) is also
> keyed by index, and is a property of the index and not the folio (since
> shared/private state is defined even before folios are allocated for a
> given index).
Thanks Ackerley and Sean for catching this and for the suggestions.
Using mpol_shared_policy_lookup() directly, the code looks much cleaner.
virt/kvm/guest_memfd.c | 26 +++++++-------------------
1 file changed, 7 insertions(+), 19 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 95267c92983b..409cbfc6db13 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -114,19 +114,6 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
-static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
- pgoff_t index)
-{
-#ifdef CONFIG_NUMA
- struct mempolicy *mpol;
-
- mpol = mpol_shared_policy_lookup(&gi->policy, index);
- return mpol ? mpol : get_task_policy(current);
-#else
- return NULL;
-#endif
-}
-
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
@@ -151,7 +138,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
if (!IS_ERR(folio))
return folio;
- policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
+ policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
folio = __filemap_get_folio_mpol(inode->i_mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping_gfp_mask(inode->i_mapping), policy);
@@ -462,11 +449,12 @@ static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
/*
- * Note! Directly return whatever the lookup returns, do NOT return
- * the current task's policy as is done when looking up the policy for
- * a specific folio. Kernel ABI for get_mempolicy() is to return
- * MPOL_DEFAULT when there is no defined policy, not whatever the
- * default policy resolves to.
+ * Return the memory policy for this index, or NULL if none is set.
+ *
+ * Returning NULL is important for the .get_policy kernel ABI:
+ * it indicates that no explicit policy has been set via mbind() for
+ * this memory. The caller can then replace NULL with the default
+ * memory policy instead of current's memory policy.
*/
return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
--
2.43.0
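As a hypothetical userspace check of the .get_policy ABI settled on
above (the flow is an assumption for illustration, not part of the
fixup; link with -lnuma):

  #include <numaif.h>

  /* Query the policy governing addr in a guest_memfd mapping. */
  static int query_policy(void *addr)
  {
          int mode = -1;

          if (get_mempolicy(&mode, NULL, 0, addr, MPOL_F_ADDR))
                  return -1;

          /* MPOL_DEFAULT if no mbind() policy covers this offset. */
          return mode;
  }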