Previously, guest-memfd allocations followed local NUMA node id in absence
of process mempolicy, resulting in arbitrary memory allocation.
Moreover, mbind() couldn't be used by the VMM as guest memory wasn't
mapped into userspace when allocation occurred.
Enable NUMA policy support by implementing vm_ops for guest-memfd mmap
operation. This allows the VMM to map the memory and use mbind() to set the
desired NUMA policy. The policy is stored in the inode structure via
kvm_gmem_inode_info, as memory policy is a property of the memory (struct
inode) itself. The policy is then retrieved via mpol_shared_policy_lookup()
and passed to filemap_grab_folio_mpol() to ensure that allocations follow
the specified memory policy.
This enables the VMM to control guest memory NUMA placement by calling
mbind() on the mapped memory regions, providing fine-grained control over
guest memory allocation across NUMA nodes.
The policy change only affects future allocations and does not migrate
existing memory. This matches mbind(2)'s default behavior which affects
only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
flags, which are not supported for guest_memfd as it is unmovable.
Suggested-by: David Hildenbrand <david@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
virt/kvm/guest_memfd.c | 67 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 65 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 356947d36a47..85edc597bb9f 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -18,6 +19,7 @@ struct kvm_gmem {
};
struct kvm_gmem_inode_info {
+ struct shared_policy policy;
struct inode vfs_inode;
};
@@ -26,6 +28,9 @@ static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct inode *inode)
return container_of(inode, struct kvm_gmem_inode_info, vfs_inode);
}
+static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
+ pgoff_t index);
+
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -112,7 +117,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+ * Fast-path: See if folio is already present in mapping to avoid
+ * policy_lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = kvm_gmem_get_pgoff_policy(KVM_GMEM_I(inode), index);
+ folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping), policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -372,8 +395,45 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
return ret;
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ return mpol_set_shared_policy(&KVM_GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+ return mpol_shared_policy_lookup(&KVM_GMEM_I(inode)->policy, *pgoff);
+}
+
+static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
+ pgoff_t index)
+{
+ struct mempolicy *mpol;
+
+ mpol = mpol_shared_policy_lookup(&info->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+}
+#else
+static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
+ pgoff_t index)
+{
+ return NULL;
+}
+#endif /* CONFIG_NUMA */
+
static const struct vm_operations_struct kvm_gmem_vm_ops = {
- .fault = kvm_gmem_fault_user_mapping,
+ .fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
@@ -408,11 +468,14 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
if (!info)
return NULL;
+ mpol_shared_policy_init(&info->policy, NULL);
+
return &info->vfs_inode;
}
static void kvm_gmem_destroy_inode(struct inode *inode)
{
+ mpol_free_shared_policy(&KVM_GMEM_I(inode)->policy);
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.43.0
On Wed, Aug 27, 2025, Shivank Garg wrote:
> @@ -26,6 +28,9 @@ static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct inode *inode)
> return container_of(inode, struct kvm_gmem_inode_info, vfs_inode);
> }
>
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
> + pgoff_t index);
> +
> /**
> * folio_file_pfn - like folio_file_page, but return a pfn.
> * @folio: The folio which contains this index.
> @@ -112,7 +117,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> {
> /* TODO: Support huge pages. */
> - return filemap_grab_folio(inode->i_mapping, index);
> + struct mempolicy *policy;
> + struct folio *folio;
> +
> + /*
> + * Fast-path: See if folio is already present in mapping to avoid
> + * policy_lookup.
> + */
> + folio = __filemap_get_folio(inode->i_mapping, index,
> + FGP_LOCK | FGP_ACCESSED, 0);
> + if (!IS_ERR(folio))
> + return folio;
> +
> + policy = kvm_gmem_get_pgoff_policy(KVM_GMEM_I(inode), index);
> + folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> + FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
> + mapping_gfp_mask(inode->i_mapping), policy);
> + mpol_cond_put(policy);
> +
> + return folio;
> }
>
> static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> @@ -372,8 +395,45 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> return ret;
> }
>
> +#ifdef CONFIG_NUMA
> +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
> +{
> + struct inode *inode = file_inode(vma->vm_file);
> +
> + return mpol_set_shared_policy(&KVM_GMEM_I(inode)->policy, vma, mpol);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> + unsigned long addr, pgoff_t *pgoff)
> +{
> + struct inode *inode = file_inode(vma->vm_file);
> +
> + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> + return mpol_shared_policy_lookup(&KVM_GMEM_I(inode)->policy, *pgoff);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
> + pgoff_t index)
I keep reading this is "page offset policy", as opposed to "policy given a page
offset". Another oddity that is confusing is that this helper explicitly does
get_task_policy(current), while kvm_gmem_get_policy() lets the caller do that.
The end result is the same, but I think it would be helpful for gmem to be
internally consistent.
If we have kvm_gmem_get_policy() use this helper, then we can kill two birds with
one stone:
static struct mempolicy *__kvm_gmem_get_policy(struct gmem_inode *gi,
pgoff_t index)
{
struct mempolicy *mpol;
mpol = mpol_shared_policy_lookup(&gi->policy, index);
return mpol ? mpol : get_task_policy(current);
}
static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *pgoff)
{
*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
return __kvm_gmem_get_policy(GMEM_I(file_inode(vma->vm_file)), *pgoff);
}
On Thu, Sep 25, 2025, Sean Christopherson wrote:
> On Wed, Aug 27, 2025, Shivank Garg wrote:
> > @@ -26,6 +28,9 @@ static inline struct kvm_gmem_inode_info *KVM_GMEM_I(struct inode *inode)
> > return container_of(inode, struct kvm_gmem_inode_info, vfs_inode);
> > }
> >
> > +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
> > + pgoff_t index);
> > +
> > /**
> > * folio_file_pfn - like folio_file_page, but return a pfn.
> > * @folio: The folio which contains this index.
> > @@ -112,7 +117,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> > static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
> > {
> > /* TODO: Support huge pages. */
> > - return filemap_grab_folio(inode->i_mapping, index);
> > + struct mempolicy *policy;
> > + struct folio *folio;
> > +
> > + /*
> > + * Fast-path: See if folio is already present in mapping to avoid
> > + * policy_lookup.
> > + */
> > + folio = __filemap_get_folio(inode->i_mapping, index,
> > + FGP_LOCK | FGP_ACCESSED, 0);
> > + if (!IS_ERR(folio))
> > + return folio;
> > +
> > + policy = kvm_gmem_get_pgoff_policy(KVM_GMEM_I(inode), index);
> > + folio = __filemap_get_folio_mpol(inode->i_mapping, index,
> > + FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
> > + mapping_gfp_mask(inode->i_mapping), policy);
> > + mpol_cond_put(policy);
> > +
> > + return folio;
> > }
> >
> > static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> > @@ -372,8 +395,45 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
> > return ret;
> > }
> >
> > +#ifdef CONFIG_NUMA
> > +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
> > +{
> > + struct inode *inode = file_inode(vma->vm_file);
> > +
> > + return mpol_set_shared_policy(&KVM_GMEM_I(inode)->policy, vma, mpol);
> > +}
> > +
> > +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> > + unsigned long addr, pgoff_t *pgoff)
> > +{
> > + struct inode *inode = file_inode(vma->vm_file);
> > +
> > + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> > + return mpol_shared_policy_lookup(&KVM_GMEM_I(inode)->policy, *pgoff);
> > +}
> > +
> > +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem_inode_info *info,
> > + pgoff_t index)
>
> I keep reading this is "page offset policy", as opposed to "policy given a page
> offset". Another oddity that is confusing is that this helper explicitly does
> get_task_policy(current), while kvm_gmem_get_policy() lets the caller do that.
> The end result is the same, but I think it would be helpful for gmem to be
> internally consistent.
>
> If we have kvm_gmem_get_policy() use this helper, then we can kill two birds with
> one stone:
>
> static struct mempolicy *__kvm_gmem_get_policy(struct gmem_inode *gi,
> pgoff_t index)
> {
> struct mempolicy *mpol;
>
> mpol = mpol_shared_policy_lookup(&gi->policy, index);
> return mpol ? mpol : get_task_policy(current);
> }
>
> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> unsigned long addr, pgoff_t *pgoff)
> {
> *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>
> return __kvm_gmem_get_policy(GMEM_I(file_inode(vma->vm_file)), *pgoff);
Argh!!!!! This breaks the selftest because do_get_mempolicy() very specifically
falls back to the default_policy, NOT to the current task's policy. That is
*exactly* the type of subtle detail that needs to be commented, because there's
no way some random KVM developer is going to know that returning NULL here is
important with respect to get_mempolicy() ABI.
On a happier note, I'm very glad you wrote a testcase :-)
I've got this as fixup-to-the-fixup:
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e796cc552a96..61130a52553f 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -114,8 +114,8 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
-static struct mempolicy *__kvm_gmem_get_policy(struct gmem_inode *gi,
- pgoff_t index)
+static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
+ pgoff_t index)
{
#ifdef CONFIG_NUMA
struct mempolicy *mpol;
@@ -151,7 +151,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
if (!IS_ERR(folio))
return folio;
- policy = __kvm_gmem_get_policy(GMEM_I(inode), index);
+ policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
folio = __filemap_get_folio_mpol(inode->i_mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping_gfp_mask(inode->i_mapping), policy);
@@ -431,9 +431,18 @@ static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpo
static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
unsigned long addr, pgoff_t *pgoff)
{
+ struct inode *inode = file_inode(vma->vm_file);
+
*pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
- return __kvm_gmem_get_policy(GMEM_I(file_inode(vma->vm_file)), *pgoff);
+ /*
+ * Note! Directly return whatever the lookup returns, do NOT return
+ * the current task's policy as is done when looking up the policy for
+ * a specific folio. Kernel ABI for get_mempolicy() is to return
+ * MPOL_DEFAULT when there is no defined policy, not whatever the
+ * default policy resolves to.
+ */
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
}
#endif /* CONFIG_NUMA */
On Fri, Sep 26, 2025 at 12:36:27PM -0700, Sean Christopherson via Linux-f2fs-devel wrote:
> >
> > static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> > unsigned long addr, pgoff_t *pgoff)
> > {
> > *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> >
> > return __kvm_gmem_get_policy(GMEM_I(file_inode(vma->vm_file)), *pgoff);
>
> Argh!!!!! This breaks the selftest because do_get_mempolicy() very specifically
> falls back to the default_policy, NOT to the current task's policy. That is
> *exactly* the type of subtle detail that needs to be commented, because there's
> no way some random KVM developer is going to know that returning NULL here is
> important with respect to get_mempolicy() ABI.
>
do_get_mempolicy() was designed to be accessed by the syscall, not as an in-kernel ABI.
get_task_policy also returns the default policy if there's nothing
there, because that's what applies.
I have dangerous questions:
why is __kvm_gmem_get_policy using
mpol_shared_policy_lookup()
instead of
get_vma_policy()
get_vma_policy does this all for you
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr, int order, pgoff_t *ilx)
{
struct mempolicy *pol;
pol = __get_vma_policy(vma, addr, ilx);
if (!pol)
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE ||
pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
*ilx += vma->vm_pgoff >> order;
*ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
}
return pol;
}
Of course you still have the same issue: get_task_policy will return the
default, because that's what applies.
do_get_mempolicy just seems like the completely incorrect interface to
be using here.
~Gregory
On Wed, Oct 15, 2025, Gregory Price wrote:
> On Fri, Sep 26, 2025 at 12:36:27PM -0700, Sean Christopherson via Linux-f2fs-devel wrote:
> > >
> > > static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> > > unsigned long addr, pgoff_t *pgoff)
> > > {
> > > *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> > >
> > > return __kvm_gmem_get_policy(GMEM_I(file_inode(vma->vm_file)), *pgoff);
> >
> > Argh!!!!! This breaks the selftest because do_get_mempolicy() very specifically
> > falls back to the default_policy, NOT to the current task's policy. That is
> > *exactly* the type of subtle detail that needs to be commented, because there's
> > no way some random KVM developer is going to know that returning NULL here is
> > important with respect to get_mempolicy() ABI.
> >
>
> Do_get_mempolicy was designed to be accessed by the syscall, not as an
> in-kernel ABI.
Ya, by "get_mempolicy() ABI" I meant the uABI for the get_mempolicy syscall.
> get_task_policy also returns the default policy if there's nothing
> there, because that's what applies.
>
> I have dangerous questions:
Not dangerous at all, I find them very helpful!
> why is __kvm_gmem_get_policy using
> mpol_shared_policy_lookup()
> instead of
> get_vma_policy()
With the disclaimer that I haven't followed the gory details of this series super
closely, my understanding is...
Because the VMA is a means to an end, and we want the policy to persist even if
the VMA goes away.
With guest_memfd, KVM effectively inverts the standard MMU model. Instead of mm/
being the primary MMU and KVM being a secondary MMU, guest_memfd is the primary
MMU and any VMAs are secondary (mostly; it's probably more like 1a and 1b). This
allows KVM to map guest_memfd memory into a guest without a VMA, or with more
permissions than are granted to host userspace, e.g. guest_memfd memory could be
writable by the guest, but read-only for userspace.
But we still want to support things like mbind() so that userspace can ensure
guest_memfd allocations align with the vNUMA topology presented to the guest,
or are bound to the NUMA node where the VM will run. We considered adding equivalent
file-based syscalls, e.g. fbind(), but IIRC the consensus was that doing so was
unnecessary (and potentially messy?) since we were planning on eventually adding
mmap() support to guest_memfd anyways.
> get_vma_policy does this all for you
I assume that doesn't work if the intent is for new VMAs to pick up the existing
policy from guest_memfd? And more importantly, guest_memfd needs to hook
->set_policy so that changes through e.g. mbind() persist beyond the lifetime of
the VMA.
> struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> unsigned long addr, int order, pgoff_t *ilx)
> {
> struct mempolicy *pol;
>
> pol = __get_vma_policy(vma, addr, ilx);
> if (!pol)
> pol = get_task_policy(current);
> if (pol->mode == MPOL_INTERLEAVE ||
> pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
> *ilx += vma->vm_pgoff >> order;
> *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
> }
> return pol;
> }
>
> Of course you still have the same issue: get_task_policy will return the
> default, because that's what applies.
>
> do_get_mempolicy just seems like the completely incorrect interface to
> be using here.
On Wed, Oct 15, 2025 at 03:48:38PM -0700, Sean Christopherson wrote:
> On Wed, Oct 15, 2025, Gregory Price wrote:
> > why is __kvm_gmem_get_policy using
> > mpol_shared_policy_lookup()
> > instead of
> > get_vma_policy()
>
> With the disclaimer that I haven't followed the gory details of this series super
> closely, my understanding is...
>
> Because the VMA is a means to an end, and we want the policy to persist even if
> the VMA goes away.
>
Ah, you know, now that i've taken a close look, I can see that you've
essentially modeled this after ipc/shm.c | mm/shmem.c pattern.
What's had me scratching my chin is that shm/shmem already has a
mempolicy pattern which ends up using folio_alloc_mpol() where the
relationship is
tmpfs: sb_info->mpol = default set by user
create_file: inode inherits copy of sb_info->mpol
fault: mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id())
So this inode mempolicy in guest_memfd is really acting more as the
filesystem-default mempolicy, which you want to survive even if userland
never maps the memory/unmaps the memory.
So the relationship is more like
guest_memfd -> creates fd/inode <- copies task mempolicy (if set)
vm: allocates memory via filemap_get_folio_mpol()
userland mmap(fd):
creates new inode<->vma mapping
vma->mpol = kvm_gmem_get_policy()
calls to set/get_policy/mbind go through kvm_gmem
This makes sense, sorry for the noise. Have been tearing apart
mempolicy lately and I'm disliking the general odor coming off
it as a whole. I had been poking at adding mempolicy support to
filemap and you got there first. Overall I think there are still
other problems with mempolicy, but this all looks fine as-is.
~Gregory
On 10/16/2025 4:18 AM, Sean Christopherson wrote:
> On Wed, Oct 15, 2025, Gregory Price wrote:
>> On Fri, Sep 26, 2025 at 12:36:27PM -0700, Sean Christopherson via Linux-f2fs-devel wrote:
>>>>
>>>> static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
>>>> unsigned long addr, pgoff_t *pgoff)
>>>> {
>>>> *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
>>>>
>>>> return __kvm_gmem_get_policy(GMEM_I(file_inode(vma->vm_file)), *pgoff);
>>>
>>> Argh!!!!! This breaks the selftest because do_get_mempolicy() very specifically
>>> falls back to the default_policy, NOT to the current task's policy. That is
>>> *exactly* the type of subtle detail that needs to be commented, because there's
>>> no way some random KVM developer is going to know that returning NULL here is
>>> important with respect to get_mempolicy() ABI.
>>>
>>
>> Do_get_mempolicy was designed to be accessed by the syscall, not as an
>> in-kernel ABI.
>
> Ya, by "get_mempolicy() ABI" I meant the uABI for the get_mempolicy syscall.
>
>> get_task_policy also returns the default policy if there's nothing
>> there, because that's what applies.
>>
>> I have dangerous questions:
>
> Not dangerous at all, I find them very helpful!
>
>> why is __kvm_gmem_get_policy using
>> mpol_shared_policy_lookup()
>> instead of
>> get_vma_policy()
>
> With the disclaimer that I haven't followed the gory details of this series super
> closely, my understanding is...
>
> Because the VMA is a means to an end, and we want the policy to persist even if
> the VMA goes away.
>
> With guest_memfd, KVM effectively inverts the standard MMU model. Instead of mm/
> being the primary MMU and KVM being a secondary MMU, guest_memfd is the primary
> MMU and any VMAs are secondary (mostly; it's probably more like 1a and 1b). This
> allows KVM to map guest_memfd memory into a guest without a VMA, or with more
> permissions than are granted to host userspace, e.g. guest_memfd memory could be
> writable by the guest, but read-only for userspace.
>
> But we still want to support things like mbind() so that userspace can ensure
> guest_memfd allocations align with the vNUMA topology presented to the guest,
> or are bound to the NUMA node where the VM will run. We considered adding equivalent
> file-based syscalls, e.g. fbind(), but IIRC the consensus was that doing so was
> unnecessary (and potentially messy?) since we were planning on eventually adding
> mmap() support to guest_memfd anyways.
>
>> get_vma_policy does this all for you
>
> I assume that doesn't work if the intent is for new VMAs to pick up the existing
> policy from guest_memfd? And more importantly, guest_memfd needs to hook
> ->set_policy so that changes through e.g. mbind() persist beyond the lifetime of
> the VMA.
>
Additionally, the shared_policy based design enables range-based policies via its RB-tree
implementation. IIUC, this will not work with VMA-specific policy design.
>> struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
>> unsigned long addr, int order, pgoff_t *ilx)
>> {
>> struct mempolicy *pol;
>>
>> pol = __get_vma_policy(vma, addr, ilx);
>> if (!pol)
>> pol = get_task_policy(current);
>> if (pol->mode == MPOL_INTERLEAVE ||
>> pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
>> *ilx += vma->vm_pgoff >> order;
>> *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
>> }
>> return pol;
>> }
>>
>> Of course you still have the same issue: get_task_policy will return the
>> default, because that's what applies.
>>
>> do_get_mempolicy just seems like the completely incorrect interface to
>> be using here.
© 2016 - 2026 Red Hat, Inc.