Previously, guest-memfd allocations followed the local NUMA node id in the
absence of a process mempolicy, resulting in arbitrary memory placement.
Moreover, mbind() couldn't be used, since the memory wasn't mapped to
userspace in the VMM.
Enable NUMA policy support by implementing vm_ops for the guest-memfd mmap
operation. This allows the VMM to map the memory and use mbind() to set
the desired NUMA policy. The policy is then retrieved via
mpol_shared_policy_lookup() and passed to filemap_grab_folio_mpol() to
ensure that allocations follow the specified memory policy.
This enables the VMM to control guest memory NUMA placement by calling
mbind() on the mapped memory regions, providing fine-grained control over
guest memory allocation across NUMA nodes.
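For illustration only (not part of this patch), a VMM could apply a policy
roughly as in the sketch below. It assumes the guest_memfd fd was already
created via KVM_CREATE_GUEST_MEMFD and that libnuma's <numaif.h> mbind()
wrapper is available; with this series the mapping only carries the policy
(no fault handler is installed), so it is not meant to be accessed directly:

  #include <numaif.h>	/* mbind(), MPOL_BIND */
  #include <sys/mman.h>
  #include <stddef.h>

  /* Hypothetical helper: bind a whole guest_memfd range to one node. */
  static int bind_gmem_to_node(int gmem_fd, size_t size, int node)
  {
  	unsigned long nodemask = 1UL << node;
  	void *addr;

  	/* Possible only once guest_memfd implements the mmap fop. */
  	addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
  		    gmem_fd, 0);
  	if (addr == MAP_FAILED)
  		return -1;

  	/*
  	 * No MPOL_MF_MOVE*: guest_memfd memory is unmovable, so the
  	 * policy only steers future allocations.
  	 */
  	return mbind(addr, size, MPOL_BIND, &nodemask,
  		     sizeof(nodemask) * 8, 0);
  }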
The policy change only affects future allocations and does not migrate
existing memory. This matches mbind(2)'s default behavior, which affects
only new allocations unless overridden with the MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
flags, which are not supported for guest_memfd as its memory is unmovable.
Suggested-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
virt/kvm/guest_memfd.c | 76 +++++++++++++++++++++++++++++++++++++++++-
1 file changed, 75 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index f18176976ae3..b3a8819117a0 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -2,6 +2,7 @@
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
@@ -11,8 +12,12 @@ struct kvm_gmem {
struct kvm *kvm;
struct xarray bindings;
struct list_head entry;
+ struct shared_policy policy;
};
+static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
+ pgoff_t index);
+
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -99,7 +104,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(file_inode(file)->i_mapping, index);
+ struct kvm_gmem *gmem = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+ * Fast-path: See if folio is already present in mapping to avoid
+ * policy_lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = kvm_gmem_get_pgoff_policy(gmem, index);
+ folio = filemap_grab_folio_mpol(inode->i_mapping, index, policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -291,6 +314,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
mutex_unlock(&kvm->slots_lock);
xa_destroy(&gmem->bindings);
+ mpol_free_shared_policy(&gmem->policy);
kfree(gmem);
kvm_put_kvm(kvm);
@@ -312,8 +336,57 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
{
return gfn - slot->base_gfn + slot->gmem.pgoff;
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+ struct file *file = vma->vm_file;
+ struct kvm_gmem *gmem = file->private_data;
+
+ return mpol_set_shared_policy(&gmem->policy, vma, new);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct file *file = vma->vm_file;
+ struct kvm_gmem *gmem = file->private_data;
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+ return mpol_shared_policy_lookup(&gmem->policy, *pgoff);
+}
+
+static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
+ pgoff_t index)
+{
+ struct mempolicy *mpol;
+
+ mpol = mpol_shared_policy_lookup(&gmem->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+}
+#else
+static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
+ pgoff_t index)
+{
+ return NULL;
+}
+#endif /* CONFIG_NUMA */
+
+static const struct vm_operations_struct kvm_gmem_vm_ops = {
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
+};
+
+static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ file_accessed(file);
+ vma->vm_ops = &kvm_gmem_vm_ops;
+ return 0;
+}
static struct file_operations kvm_gmem_fops = {
+ .mmap = kvm_gmem_mmap,
.open = generic_file_open,
.release = kvm_gmem_release,
.fallocate = kvm_gmem_fallocate,
@@ -446,6 +519,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
kvm_get_kvm(kvm);
gmem->kvm = kvm;
xa_init(&gmem->bindings);
+ mpol_shared_policy_init(&gmem->policy, NULL);
list_add(&gmem->entry, &inode->i_mapping->i_private_list);
fd_install(fd, file);
--
2.34.1
Shivank Garg <shivankg@amd.com> writes:
> Previously, guest-memfd allocations followed local NUMA node id in absence
> of process mempolicy, resulting in arbitrary memory allocation.
> Moreover, mbind() couldn't be used since memory wasn't mapped to userspace
> in the VMM.
>
> Enable NUMA policy support by implementing vm_ops for guest-memfd mmap
> operation. This allows the VMM to map the memory and use mbind() to set
> the desired NUMA policy. The policy is then retrieved via
> mpol_shared_policy_lookup() and passed to filemap_grab_folio_mpol() to
> ensure that allocations follow the specified memory policy.
>
> This enables the VMM to control guest memory NUMA placement by calling
> mbind() on the mapped memory regions, providing fine-grained control over
> guest memory allocation across NUMA nodes.
>
> The policy change only affect future allocations and does not migrate
> existing memory. This matches mbind(2)'s default behavior which affects
> only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
> flags, which are not supported for guest_memfd as it is unmovable.
>
> Suggested-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Shivank Garg <shivankg@amd.com>
> ---
> virt/kvm/guest_memfd.c | 76 +++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 75 insertions(+), 1 deletion(-)
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index f18176976ae3..b3a8819117a0 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -2,6 +2,7 @@
> #include <linux/backing-dev.h>
> #include <linux/falloc.h>
> #include <linux/kvm_host.h>
> +#include <linux/mempolicy.h>
> #include <linux/pagemap.h>
> #include <linux/anon_inodes.h>
>
> @@ -11,8 +12,12 @@ struct kvm_gmem {
> struct kvm *kvm;
> struct xarray bindings;
> struct list_head entry;
> + struct shared_policy policy;
> };
>
struct shared_policy should be stored on the inode rather than the file,
since the memory policy is a property of the memory (struct inode),
rather than a property of how the memory is used for a given VM (struct
file).
When the shared_policy is stored on the inode, intra-host migration [1]
will work correctly: while the inode will be transferred from one VM
(struct kvm) to another, the file (a VM's view/bindings of the memory)
will be recreated for the new VM.
I'm thinking of having a patch like this [2] to introduce inodes.
With this, we shouldn't need to pass file pointers instead of inode
pointers.
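To make the suggestion concrete, here is a rough sketch of the inode-based
variant (struct and field names are illustrative assumptions, not the code
from [2]): the shared_policy would move from struct kvm_gmem into
inode-private data, and lookups would key off the inode rather than the file.

  /* Illustrative sketch only; the exact layout depends on how [2] lands. */
  struct kvm_gmem_inode_private {
  	struct shared_policy policy;	/* policy lives with the memory */
  };

  static struct mempolicy *kvm_gmem_get_pgoff_policy(struct inode *inode,
  						   pgoff_t index)
  {
  	struct kvm_gmem_inode_private *priv = inode->i_private;
  	struct mempolicy *mpol;

  	mpol = mpol_shared_policy_lookup(&priv->policy, index);
  	return mpol ? mpol : get_task_policy(current);
  }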
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
> + pgoff_t index);
> +
> /**
> * folio_file_pfn - like folio_file_page, but return a pfn.
> * @folio: The folio which contains this index.
> @@ -99,7 +104,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> {
> /* TODO: Support huge pages. */
> - return filemap_grab_folio(file_inode(file)->i_mapping, index);
> + struct kvm_gmem *gmem = file->private_data;
> + struct inode *inode = file_inode(file);
> + struct mempolicy *policy;
> + struct folio *folio;
> +
> + /*
> + * Fast-path: See if folio is already present in mapping to avoid
> + * policy_lookup.
> + */
> + folio = __filemap_get_folio(inode->i_mapping, index,
> + FGP_LOCK | FGP_ACCESSED, 0);
> + if (!IS_ERR(folio))
> + return folio;
> +
> + policy = kvm_gmem_get_pgoff_policy(gmem, index);
> + folio = filemap_grab_folio_mpol(inode->i_mapping, index, policy);
> + mpol_cond_put(policy);
> +
> + return folio;
> }
>
> static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> @@ -291,6 +314,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
> mutex_unlock(&kvm->slots_lock);
>
> xa_destroy(&gmem->bindings);
> + mpol_free_shared_policy(&gmem->policy);
> kfree(gmem);
>
> kvm_put_kvm(kvm);
> @@ -312,8 +336,57 @@ static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
> {
> return gfn - slot->base_gfn + slot->gmem.pgoff;
> }
> +#ifdef CONFIG_NUMA
> +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
> +{
> + struct file *file = vma->vm_file;
> + struct kvm_gmem *gmem = file->private_data;
> +
> + return mpol_set_shared_policy(&gmem->policy, vma, new);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
> + unsigned long addr, pgoff_t *pgoff)
> +{
> + struct file *file = vma->vm_file;
> + struct kvm_gmem *gmem = file->private_data;
> +
> + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
> + return mpol_shared_policy_lookup(&gmem->policy, *pgoff);
> +}
> +
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
> + pgoff_t index)
> +{
> + struct mempolicy *mpol;
> +
> + mpol = mpol_shared_policy_lookup(&gmem->policy, index);
> + return mpol ? mpol : get_task_policy(current);
> +}
> +#else
> +static struct mempolicy *kvm_gmem_get_pgoff_policy(struct kvm_gmem *gmem,
> + pgoff_t index)
> +{
> + return NULL;
> +}
> +#endif /* CONFIG_NUMA */
> +
> +static const struct vm_operations_struct kvm_gmem_vm_ops = {
> +#ifdef CONFIG_NUMA
> + .get_policy = kvm_gmem_get_policy,
> + .set_policy = kvm_gmem_set_policy,
> +#endif
> +};
> +
> +static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> + file_accessed(file);
> + vma->vm_ops = &kvm_gmem_vm_ops;
> + return 0;
> +}
>
> static struct file_operations kvm_gmem_fops = {
> + .mmap = kvm_gmem_mmap,
> .open = generic_file_open,
> .release = kvm_gmem_release,
> .fallocate = kvm_gmem_fallocate,
> @@ -446,6 +519,7 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
> kvm_get_kvm(kvm);
> gmem->kvm = kvm;
> xa_init(&gmem->bindings);
> + mpol_shared_policy_init(&gmem->policy, NULL);
> list_add(&gmem->entry, &inode->i_mapping->i_private_list);
>
> fd_install(fd, file);
[1] https://lore.kernel.org/lkml/cover.1691446946.git.ackerleytng@google.com/T/
[2] https://lore.kernel.org/all/d1940d466fc69472c8b6dda95df2e0522b2d8744.1726009989.git.ackerleytng@google.com/
On 2/28/25 18:25, Ackerley Tng wrote:
> Shivank Garg <shivankg@amd.com> writes:
>
>> Previously, guest-memfd allocations followed local NUMA node id in absence
>> of process mempolicy, resulting in arbitrary memory allocation.
>> Moreover, mbind() couldn't be used since memory wasn't mapped to userspace
>> in the VMM.
>>
>> Enable NUMA policy support by implementing vm_ops for guest-memfd mmap
>> operation. This allows the VMM to map the memory and use mbind() to set
>> the desired NUMA policy. The policy is then retrieved via
>> mpol_shared_policy_lookup() and passed to filemap_grab_folio_mpol() to
>> ensure that allocations follow the specified memory policy.
>>
>> This enables the VMM to control guest memory NUMA placement by calling
>> mbind() on the mapped memory regions, providing fine-grained control over
>> guest memory allocation across NUMA nodes.
>>
>> The policy change only affect future allocations and does not migrate
>> existing memory. This matches mbind(2)'s default behavior which affects
>> only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
>> flags, which are not supported for guest_memfd as it is unmovable.
>>
>> Suggested-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: Shivank Garg <shivankg@amd.com>
>> ---
>> virt/kvm/guest_memfd.c | 76 +++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 75 insertions(+), 1 deletion(-)
>>
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index f18176976ae3..b3a8819117a0 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -2,6 +2,7 @@
>> #include <linux/backing-dev.h>
>> #include <linux/falloc.h>
>> #include <linux/kvm_host.h>
>> +#include <linux/mempolicy.h>
>> #include <linux/pagemap.h>
>> #include <linux/anon_inodes.h>
>>
>> @@ -11,8 +12,12 @@ struct kvm_gmem {
>> struct kvm *kvm;
>> struct xarray bindings;
>> struct list_head entry;
>> + struct shared_policy policy;
>> };
>>
>
> struct shared_policy should be stored on the inode rather than the file,
> since the memory policy is a property of the memory (struct inode),
> rather than a property of how the memory is used for a given VM (struct
> file).
That makes sense. AFAICS shmem also uses inodes to store policy.
> When the shared_policy is stored on the inode, intra-host migration [1]
> will work correctly, since the while the inode will be transferred from
> one VM (struct kvm) to another, the file (a VM's view/bindings of the
> memory) will be recreated for the new VM.
>
> I'm thinking of having a patch like this [2] to introduce inodes.
shmem has it easier by already having inodes
> With this, we shouldn't need to pass file pointers instead of inode
> pointers.
Any downsides, besides more work needed? Or is it feasible to do it using
files now and convert to inodes later?
Feels like something that must have been discussed already, but I don't
recall specifics.
Vlastimil Babka <vbabka@suse.cz> writes:
> On 2/28/25 18:25, Ackerley Tng wrote:
>> Shivank Garg <shivankg@amd.com> writes:
>>
>>> Previously, guest-memfd allocations followed local NUMA node id in absence
>>> of process mempolicy, resulting in arbitrary memory allocation.
>>> Moreover, mbind() couldn't be used since memory wasn't mapped to userspace
>>> in the VMM.
>>>
>>> Enable NUMA policy support by implementing vm_ops for guest-memfd mmap
>>> operation. This allows the VMM to map the memory and use mbind() to set
>>> the desired NUMA policy. The policy is then retrieved via
>>> mpol_shared_policy_lookup() and passed to filemap_grab_folio_mpol() to
>>> ensure that allocations follow the specified memory policy.
>>>
>>> This enables the VMM to control guest memory NUMA placement by calling
>>> mbind() on the mapped memory regions, providing fine-grained control over
>>> guest memory allocation across NUMA nodes.
>>>
>>> The policy change only affect future allocations and does not migrate
>>> existing memory. This matches mbind(2)'s default behavior which affects
>>> only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
>>> flags, which are not supported for guest_memfd as it is unmovable.
>>>
>>> Suggested-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: Shivank Garg <shivankg@amd.com>
>>> ---
>>> virt/kvm/guest_memfd.c | 76 +++++++++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 75 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>>> index f18176976ae3..b3a8819117a0 100644
>>> --- a/virt/kvm/guest_memfd.c
>>> +++ b/virt/kvm/guest_memfd.c
>>> @@ -2,6 +2,7 @@
>>> #include <linux/backing-dev.h>
>>> #include <linux/falloc.h>
>>> #include <linux/kvm_host.h>
>>> +#include <linux/mempolicy.h>
>>> #include <linux/pagemap.h>
>>> #include <linux/anon_inodes.h>
>>>
>>> @@ -11,8 +12,12 @@ struct kvm_gmem {
>>> struct kvm *kvm;
>>> struct xarray bindings;
>>> struct list_head entry;
>>> + struct shared_policy policy;
>>> };
>>>
>>
>> struct shared_policy should be stored on the inode rather than the file,
>> since the memory policy is a property of the memory (struct inode),
>> rather than a property of how the memory is used for a given VM (struct
>> file).
>
> That makes sense. AFAICS shmem also uses inodes to store policy.
>
>> When the shared_policy is stored on the inode, intra-host migration [1]
>> will work correctly, since the while the inode will be transferred from
>> one VM (struct kvm) to another, the file (a VM's view/bindings of the
>> memory) will be recreated for the new VM.
>>
>> I'm thinking of having a patch like this [2] to introduce inodes.
>
> shmem has it easier by already having inodes
>
>> With this, we shouldn't need to pass file pointers instead of inode
>> pointers.
>
> Any downsides, besides more work needed? Or is it feasible to do it using
> files now and convert to inodes later?
>
> Feels like something that must have been discussed already, but I don't
> recall specifics.
Here's where Sean described file vs inode: "The inode is effectively the
raw underlying physical storage, while the file is the VM's view of that
storage." [1].
I guess you're right that for now there is little distinction between
file and inode and using file should be feasible, but I feel that this
dilutes the original intent. Something like [2] doesn't seem like too
big of a change and could perhaps be included earlier rather than later,
since it will also contribute to support for restricted mapping [3].
[1] https://lore.kernel.org/all/ZLGiEfJZTyl7M8mS@google.com/
[2] https://lore.kernel.org/all/d1940d466fc69472c8b6dda95df2e0522b2d8744.1726009989.git.ackerleytng@google.com/
[3] https://lore.kernel.org/all/20250117163001.2326672-1-tabba@google.com/T/
On Tue, Mar 04, 2025, Ackerley Tng wrote:
> Vlastimil Babka <vbabka@suse.cz> writes:
>
> [...]
>
> Here's where Sean described file vs inode: "The inode is effectively the
> raw underlying physical storage, while the file is the VM's view of that
> storage." [1].
>
> I guess you're right that for now there is little distinction between
> file and inode and using file should be feasible, but I feel that this
> dilutes the original intent.

Hmm, and using the file would be actively problematic at some point. One could
argue that NUMA policy is property of the VM accessing the memory, i.e. that two
VMs mapping the same guest_memfd could want different policies. But in practice,
that would allow for conflicting requirements, e.g. different policies in each
VM for the same chunk of memory, and would likely lead to surprising behavior due
to having to manually do mbind() for every VM/file view.

> Something like [2] doesn't seem like too big of a change and could perhaps be
> included earlier rather than later, since it will also contribute to support
> for restricted mapping [3].
>
> [1] https://lore.kernel.org/all/ZLGiEfJZTyl7M8mS@google.com/
> [2] https://lore.kernel.org/all/d1940d466fc69472c8b6dda95df2e0522b2d8744.1726009989.git.ackerleytng@google.com/
> [3] https://lore.kernel.org/all/20250117163001.2326672-1-tabba@google.com/T/
On 04.03.25 16:30, Sean Christopherson wrote:
> On Tue, Mar 04, 2025, Ackerley Tng wrote:
>
> [...]
>
> Hmm, and using the file would be actively problematic at some point. One could
> argue that NUMA policy is property of the VM accessing the memory, i.e. that two
> VMs mapping the same guest_memfd could want different policies. But in practice,
> that would allow for conflicting requirements, e.g. different policies in each
> VM for the same chunk of memory, and would likely lead to surprising behavior due
> to having to manually do mbind() for every VM/file view.

I think that's the same behavior with shmem? I mean, if you have two people
asking for different things for the same MAP_SHARE file range, surprises are
unavoidable.

Or am I missing something?

-- 
Cheers,

David / dhildenb
On Tue, Mar 04, 2025, David Hildenbrand wrote:
> On 04.03.25 16:30, Sean Christopherson wrote:
>
> [...]
>
> I think that's the same behavior with shmem? I mean, if you have two people
> asking for different things for the same MAP_SHARE file range, surprises are
> unavoidable.

Yeah, I was specifically thinking of the case where a secondary mapping doesn't
do mbind() at all, e.g. could end up effectively polluting guest_memfd with "bad"
allocations.
On 3/4/2025 10:29 PM, Sean Christopherson wrote:
> On Tue, Mar 04, 2025, David Hildenbrand wrote:
>>
>> [...]
>>
>> I think that's the same behavior with shmem? I mean, if you have two people
>> asking for different things for the same MAP_SHARE file range, surprises are
>> unavoidable.
>
> Yeah, I was specifically thinking of the case where a secondary mapping doesn't
> do mbind() at all, e.g. could end up effectively polluting guest_memfd with "bad"
> allocations.

Thank you for the feedback. I agree that storing the policy in the inode is the
correct approach, as it aligns with shmem's behavior. I now understand that
keeping the policy in file-private data could lead to surprising behavior,
especially with multiple VMs mapping the same guest_memfd.

The inode-based approach also makes sense from a long-term perspective,
especially with upcoming restricted mapping support.

I'll pick Ackerley's patch [1] to add support for gmem inodes. With that patch,
it does not seem overly complex to implement the policy storage in inodes.
I'll test this approach and submit a revised patch shortly.

[1] https://lore.kernel.org/all/d1940d466fc69472c8b6dda95df2e0522b2d8744.1726009989.git.ackerleytng@google.com/

Thanks,
Shivank