[RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 7 months, 1 week ago
Track guest_memfd memory's shareability status within the inode as
opposed to the file, since it is a property of the guest_memfd's memory
contents.

Shareability is indexed by the page's index in the inode. Because it is
a property of the memory rather than of the VM, it is stored within
guest_memfd instead of within KVM (as kvm->mem_attr_array is).

KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array is still retained
to allow VMs to use guest_memfd only for private memory, with some
other memory backing shared memory.

Not all use cases require guest_memfd memory to be shared with the
host when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
which, when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
private to the guest and hence not mappable by the host. Otherwise,
memory is shared until explicitly converted to private.
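
A minimal userspace sketch of creating such a guest_memfd (vm_fd and
the size are illustrative; error handling is abbreviated):

	struct kvm_create_guest_memfd args = {
		.size  = 0x400000,
		.flags = GUEST_MEMFD_FLAG_SUPPORT_SHARED |
			 GUEST_MEMFD_FLAG_INIT_PRIVATE,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

	if (gmem_fd < 0)
		err(1, "KVM_CREATE_GUEST_MEMFD");

	/*
	 * All offsets start as SHAREABILITY_GUEST, so faulting a host
	 * mapping of gmem_fd fails until a range is converted to shared.
	 */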

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>
Co-developed-by: Fuad Tabba <tabba@google.com>
Signed-off-by: Fuad Tabba <tabba@google.com>
Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
---
 Documentation/virt/kvm/api.rst |   5 ++
 include/uapi/linux/kvm.h       |   2 +
 virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
 3 files changed, 129 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 86f74ce7f12a..f609337ae1c2 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
 The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
 This is validated when the guest_memfd instance is bound to the VM.
 
+If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
+supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
+will initialize the memory for the guest_memfd as guest-only and not faultable
+by the host.
+
 See KVM_SET_USER_MEMORY_REGION2 for additional details.
 
 4.143 KVM_PRE_FAULT_MEMORY
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4cc824a3a7c9..d7df312479aa 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
 #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
 
 #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
+
 #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
+#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
 
 struct kvm_create_guest_memfd {
 	__u64 size;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 239d0f13dcc1..590932499eba 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
 #include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/kvm_host.h>
+#include <linux/maple_tree.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
 
@@ -17,6 +18,24 @@ struct kvm_gmem {
 	struct list_head entry;
 };
 
+struct kvm_gmem_inode_private {
+#ifdef CONFIG_KVM_GMEM_SHARED_MEM
+	struct maple_tree shareability;
+#endif
+};
+
+enum shareability {
+	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
+	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
+};
+
+static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
+
+static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
+{
+	return inode->i_mapping->i_private_data;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
 	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
 }
 
+#ifdef CONFIG_KVM_GMEM_SHARED_MEM
+
+static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
+				      loff_t size, u64 flags)
+{
+	enum shareability m;
+	pgoff_t last;
+
+	last = (size >> PAGE_SHIFT) - 1;
+	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
+						    SHAREABILITY_ALL;
+	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
+				 GFP_KERNEL);
+}
+
+static enum shareability kvm_gmem_shareability_get(struct inode *inode,
+						 pgoff_t index)
+{
+	struct maple_tree *mt;
+	void *entry;
+
+	mt = &kvm_gmem_private(inode)->shareability;
+	entry = mtree_load(mt, index);
+	WARN(!entry,
+	     "Shareability should always be defined for all indices in inode.");
+
+	return xa_to_value(entry);
+}
+
+static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
+{
+	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
+		return ERR_PTR(-EACCES);
+
+	return kvm_gmem_get_folio(inode, index);
+}
+
+#else
+
+static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private, loff_t size, u64 flags)
+{
+	return 0;
+}
+
+static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
+{
+	WARN_ONCE(1, "Unexpected call to get shared folio.");
+	return NULL;
+}
+
+#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
+
 static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
 				    pgoff_t index, struct folio *folio)
 {
@@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
 
 	filemap_invalidate_lock_shared(inode->i_mapping);
 
-	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
+	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
 	if (IS_ERR(folio)) {
 		int err = PTR_ERR(folio);
 
@@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
 	.fallocate	= kvm_gmem_fallocate,
 };
 
+static void kvm_gmem_free_inode(struct inode *inode)
+{
+	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
+
+	kfree(private);
+
+	free_inode_nonrcu(inode);
+}
+
+static void kvm_gmem_destroy_inode(struct inode *inode)
+{
+	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
+
+#ifdef CONFIG_KVM_GMEM_SHARED_MEM
+	/*
+	 * mtree_destroy() can't be used within rcu callback, hence can't be
+	 * done in ->free_inode().
+	 */
+	if (private)
+		mtree_destroy(&private->shareability);
+#endif
+}
+
 static const struct super_operations kvm_gmem_super_operations = {
 	.statfs		= simple_statfs,
+	.destroy_inode	= kvm_gmem_destroy_inode,
+	.free_inode	= kvm_gmem_free_inode,
 };
 
 static int kvm_gmem_init_fs_context(struct fs_context *fc)
@@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
 static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 						      loff_t size, u64 flags)
 {
+	struct kvm_gmem_inode_private *private;
 	struct inode *inode;
+	int err;
 
 	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
 	if (IS_ERR(inode))
 		return inode;
 
+	err = -ENOMEM;
+	private = kzalloc(sizeof(*private), GFP_KERNEL);
+	if (!private)
+		goto out;
+
+	mt_init(&private->shareability);
+	inode->i_mapping->i_private_data = private;
+
+	err = kvm_gmem_shareability_setup(private, size, flags);
+	if (err)
+		goto out;
+
 	inode->i_private = (void *)(unsigned long)flags;
 	inode->i_op = &kvm_gmem_iops;
 	inode->i_mapping->a_ops = &kvm_gmem_aops;
@@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
 	return inode;
+
+out:
+	iput(inode);
+
+	return ERR_PTR(err);
 }
 
 static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
@@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 	if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
 		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
 
+	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
+		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
+
 	if (flags & ~valid_flags)
 		return -EINVAL;
 
@@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 	if (!file)
 		return -EFAULT;
 
+	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
+
 	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
 	if (IS_ERR(folio)) {
 		r = PTR_ERR(folio);
@@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
 		*page = folio_file_page(folio, index);
 	else
 		folio_put(folio);
-
 out:
+	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
 	fput(file);
 	return r;
 }
-- 
2.49.0.1045.g170613ef41-goog
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Sean Christopherson 2 months, 2 weeks ago
Trimmed the Cc substantially as I doubt non-gmem/KVM folks will be excited about
thread necromancy.

On Wed, May 14, 2025, Ackerley Tng wrote:
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 86f74ce7f12a..f609337ae1c2 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>  This is validated when the guest_memfd instance is bound to the VM.
>  
> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> +will initialize the memory for the guest_memfd as guest-only and not faultable
> +by the host.

Whatever documentation we add should land at the same time as the collateral.
KVM_CAP_GMEM_CONVERSIONS literally doesn't exist at this time.

> @@ -17,6 +18,24 @@ struct kvm_gmem {
>  	struct list_head entry;
>  };
>  
> +struct kvm_gmem_inode_private {
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	struct maple_tree shareability;
> +#endif
> +};
> +
> +enum shareability {
> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
> +};

Rather than define new values and new KVM uAPI, I think we should instead simply
support KVM_SET_MEMORY_ATTRIBUTES.  We'll probably need a new CAP, as I'm not sure
supporting KVM_CHECK_EXTENSION+KVM_CAP_MEMORY_ATTRIBUTES on a gmem fd would be a
good idea (e.g. trying to do KVM_CAP_GUEST_MEMFD_FLAGS on a gmem fd doesn't work
because the whole point is to get flags _before_ creating the gmem instance).  But
adding e.g. KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES is easy enough.

But for specifying PRIVATE vs. SHARED, I don't see any reason to define new uAPI.
I also don't want an entirely new set of terms in KVM to describe the same things.
PRIVATE and SHARED are far from perfect, but they're better than https://xkcd.com/927.
And if we ever want to let userspace restrict RWX protections in gmem, we'll have
a ready-made way to do so.  

Internally, that lets us do some fun things in KVM.  E.g. if we make the "disable
legacy per-VM memory attributes" a read-only module param, then we can wire up a
static_call() for kvm_get_memory_attributes() and then kvm_mem_is_private() will
Just Work.

  static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
  {
	return static_call(__kvm_get_memory_attributes)(kvm, gfn);
  }

  static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
  {
	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
  }

That might trigger some additional surgery if/when we want to support RWX
protections on a per-VM basis _and_ a per-gmem basis, but I suspect such churn
would pale in comparison to the overall support needed for RWX protections.

The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
union to clarify it's a pgoff instead of an address when used for guest_memfd.

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 52f6000ab020..e0d8255ac8d2 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
 #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
 
 struct kvm_memory_attributes {
-       __u64 address;
+       union {
+               __u64 address;
+               __u64 offset;
+       };
        __u64 size;
        __u64 attributes;
        __u64 flags;

> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> +
> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> +{
> +	return inode->i_mapping->i_private_data;

This is a hilariously bad helper.  Everyone and their mother is going to think
about "private vs. shared" when they see kvm_gmem_private(), at least on the x86
side.

What's even more absurd is that the only "final" usage of the helper is to
free/destroy the inode:

  $ git grep kvm_gmem_private
  virt/kvm/guest_memfd.c:static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
  virt/kvm/guest_memfd.c: return kvm_gmem_private(inode)->allocator_ops;
  virt/kvm/guest_memfd.c: return kvm_gmem_private(inode)->allocator_private;
  virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
  virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
  virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
  virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
  virt/kvm/guest_memfd.c: struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
  virt/kvm/guest_memfd.c: struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);

And in that case, using a wrapper is counter-productive, just reference
inode->i_mapping->i_private_data directly so that readers don't have to jump
through a useless layer.

Luckily, "struct kvm_gmem_inode_private" no longer needs to exist, now that
Shivank's NUMA policy series wraps the vfs_inode with a gmem_inode, and can be
retrieved via GMEM_I().  FWIW, before looking at that series, I was going to suggest
something like to_gmem(), but I definitely think we should follow filesystems
convention, not KVM vCPU/VM convention.

>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +
> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
> +				      loff_t size, u64 flags)
> +{
> +	enum shareability m;
> +	pgoff_t last;
> +
> +	last = (size >> PAGE_SHIFT) - 1;
> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
> +						    SHAREABILITY_ALL;
> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
> +				 GFP_KERNEL);
> +}
> +
> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> +						 pgoff_t index)
> +{
> +	struct maple_tree *mt;
> +	void *entry;
> +
> +	mt = &kvm_gmem_private(inode)->shareability;
> +	entry = mtree_load(mt, index);
> +	WARN(!entry,

WARN_ON_ONCE(), otherwise we risk escalating a per-VM problem into a system-wide
DoS.

> +	     "Shareability should always be defined for all indices in inode.");
> +
> +	return xa_to_value(entry);
> +}
> +
> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> +		return ERR_PTR(-EACCES);
> +
> +	return kvm_gmem_get_folio(inode, index);

Please don't add 1-3 line helpers with one caller and very little hope of gaining
additional users, especially in guest_memfd where "shared" and "private" have
multiple meanings, and so things like "get_shared_folio" are inherently ambiguous.

I'm pretty sure a lot of this stems from CONFIG_KVM_GMEM_SHARED_MEM, which AFAICT
simply won't exist.  But just in case this is a Google3 pattern... 

> +}
> +
> +#else
> +
> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private, loff_t size, u64 flags)
> +{
> +	return 0;
> +}
> +
> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	WARN_ONCE(1, "Unexpected call to get shared folio.");
> +	return NULL;
> +}
> +
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> +
>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  				    pgoff_t index, struct folio *folio)
>  {
> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>  
>  	filemap_invalidate_lock_shared(inode->i_mapping);
>  
> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);

I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
underlying memory from being converted from shared=>private after checking that
the page is SHARED.

The locking rules for the maple_tree are also undocumented and haphazard.  I think
we can kill several birds with one stone by protecting the attributes with
invalidate_lock.  A bonus with using invalidate_lock is that it's a sleepable
lock, not a spinlock.  I don't think there's anything that would immediately
benefit?  But if we wanted to populate the tree on-demand (versus pre-filling
all possible pages), then it'd be easier to handle things like allocations in a
race free manner.

	/*
	 * Protect the attributes with the invalidation lock, which will always
	 * be held on conversions
	 */
	mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
	mt_set_external_lock(&gi->attributes,
			     &inode->i_mapping->invalidate_lock);
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Vishal Annapurve 2 months, 2 weeks ago
On Tue, Sep 30, 2025 at 4:40 PM Sean Christopherson <seanjc@google.com> wrote:
>
> > +};
> > +
> > +enum shareability {
> > +     SHAREABILITY_GUEST = 1, /* Only the guest can map (fault) folios in this range. */
> > +     SHAREABILITY_ALL = 2,   /* Both guest and host can fault folios in this range. */
> > +};
>
> Rather than define new values and new KVM uAPI, I think we should instead simply
> support KVM_SET_MEMORY_ATTRIBUTES.  We'll probably need a new CAP, as I'm not sure
> supporting KVM_CHECK_EXTENSION+KVM_CAP_MEMORY_ATTRIBUTES on a gmem fd would be a
> good idea (e.g. trying to do KVM_CAP_GUEST_MEMFD_FLAGS on a gmem fd doesn't work
> because the whole point is to get flags _before_ creating the gmem instance).  But
> adding e.g. KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES is easy enough.
>
> But for specifying PRIVATE vs. SHARED, I don't see any reason to define new uAPI.
> I also don't want an entirely new set of terms in KVM to describe the same things.
> PRIVATE and SHARED are far from perfect, but they're better than https://xkcd.com/927.
> And if we ever want to let userspace restrict RWX protections in gmem, we'll have
> a ready-made way to do so.
>

I don't understand why we need to reuse KVM_SET_MEMORY_ATTRIBUTES. It's
a new ABI anyway, as it's on a guest_memfd FD instead of the KVM FD.

RWX protections seem to be pagetable configuration rather than
guest_memfd properties. Can mmap flags + kvm userfaultfd help enforce
RWX protections?
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Sean Christopherson 2 months, 2 weeks ago
On Wed, Oct 01, 2025, Vishal Annapurve wrote:
> On Tue, Sep 30, 2025 at 4:40 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > > +};
> > > +
> > > +enum shareability {
> > > +     SHAREABILITY_GUEST = 1, /* Only the guest can map (fault) folios in this range. */
> > > +     SHAREABILITY_ALL = 2,   /* Both guest and host can fault folios in this range. */
> > > +};
> >
> > Rather than define new values and new KVM uAPI, I think we should instead simply
> > support KVM_SET_MEMORY_ATTRIBUTES.  We'll probably need a new CAP, as I'm not sure
> > supporting KVM_CHECK_EXTENSION+KVM_CAP_MEMORY_ATTRIBUTES on a gmem fd would be a
> > good idea (e.g. trying to do KVM_CAP_GUEST_MEMFD_FLAGS on a gmem fd doesn't work
> > because the whole point is to get flags _before_ creating the gmem instance).  But
> > adding e.g. KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES is easy enough.
> >
> > But for specifying PRIVATE vs. SHARED, I don't see any reason to define new uAPI.
> > I also don't want an entirely new set of terms in KVM to describe the same things.
> > PRIVATE and SHARED are far from perfect, but they're better than https://xkcd.com/927.
> > And if we ever want to let userspace restrict RWX protections in gmem, we'll have
> > a ready-made way to do so.
> >
> 
> I don't understand why we need to reuse KVM_SET_MEMORY_ATTRIBUTES. It's
> a new ABI anyway, as it's on a guest_memfd FD instead of the KVM FD.

Yes, it's new functionality, but the semantics are the same (modulo s/address/offset),
which makes life easier for KVM and its developers.  Specifically I want to avoid
ending up with two entirely different ways for describing private vs. shared memory.
E.g. I don't want to have to translate between SHAREABILITY_GUEST and PRIVATE,
in code or in conversation.

> RWX protections seem to be pagetable configuration rather than
> guest_memfd properties. Can mmap flags + kvm userfaultfd help enforce
> RWX protections?

No, because mmap() is optional.  Potential use cases are for (selectively)
restricting _guest_ access as well as host access.  mmap() isn't a good fit
regardless, as that's much more about describing what the process wants, not
the properties of the underlying memory.

E.g. read-only and noexec file systems exist for a reason.
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 2 months, 2 weeks ago
Sean Christopherson <seanjc@google.com> writes:

> Trimmed the Cc substantially as I doubt non-gmem/KVM folks will be excited about
> thread necromancy.
>
> On Wed, May 14, 2025, Ackerley Tng wrote:
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 86f74ce7f12a..f609337ae1c2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>>  This is validated when the guest_memfd instance is bound to the VM.
>>  
>> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
>> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
>> +will initialize the memory for the guest_memfd as guest-only and not faultable
>> +by the host.
>
> Whatever documentation we add should land at the same time as the collateral.
> KVM_CAP_GMEM_CONVERSIONS literally doesn't exist at this time.
>

Thanks, will keep this in mind for next time.

>> @@ -17,6 +18,24 @@ struct kvm_gmem {
>>  	struct list_head entry;
>>  };
>>  
>> +struct kvm_gmem_inode_private {
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +	struct maple_tree shareability;
>> +#endif
>> +};
>> +
>> +enum shareability {
>> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
>> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
>> +};
>
> Rather than define new values and new KVM uAPI, I think we should instead simply
> support KVM_SET_MEMORY_ATTRIBUTES.  We'll probably need a new CAP, as I'm not sure
> supporting KVM_CHECK_EXTENSION+KVM_CAP_MEMORY_ATTRIBUTES on a gmem fd would be a
> good idea (e.g. trying to do KVM_CAP_GUEST_MEMFD_FLAGS on a gmem fd doesn't work
> because the whole point is to get flags _before_ creating the gmem instance).  But
> adding e.g. KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES is easy enough.
>

I've read this a few times and I'm a bit confused, so just making sure:
you are suggesting that we reuse the KVM_SET_MEMORY_ATTRIBUTES ioctl as
a guest_memfd (not a VM) ioctl and still store private/shared state
within guest_memfd, right?

I think fundamentally the introduction of the guest_memfd ioctls was
motivated by how private/shared state is a property of memory and not a
property of the VM. (IIRC you were the one to most succinctly phrase it
this way on one of the guest_memfd biweeklies.) So I hope you don't mean
to revert to doing conversions through a VM ioctl.

> But for specifying PRIVATE vs. SHARED, I don't see any reason to define new uAPI.
> I also don't want an entirely new set of terms in KVM to describe the same things.
> PRIVATE and SHARED are far from perfect, but they're better than https://xkcd.com/927.
> And if we ever want to let userspace restrict RWX protections in gmem, we'll have
> a ready-made way to do so.  
>

Would like to understand more about RWX protections: is the use case to
let userspace specify that certain ranges of guest memory are to be
mapped into stage 2 page tables without executable permissions?

Is there a further use case to let the guest specify that userspace must
not mmap() some ranges as executable?

For guest_memfd the userspace mapping permissions are already defined
by userspace and so unless guest_memfd must enforce something on behalf
of the guest, there shouldn't be anything more that guest_memfd should
track with respect to RWX permissions.

> Internally, that lets us do some fun things in KVM.  E.g. if we make the "disable
> legacy per-VM memory attributes" a read-only module param, then we can wire up a
> static_call() for kvm_get_memory_attributes() and then kvm_mem_is_private() will
> Just Work.
>
>   static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
>   {
> 	return static_call(__kvm_get_memory_attributes)(kvm, gfn);
>   }
>
>   static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>   {
> 	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>   }
>
> That might trigger some additional surgery if/when we want to support RWX
> protections on a per-VM basis _and_ a per-gmem basis, but I suspect such churn
> would pale in comparison to the overall support needed for RWX protections.
>

RWX protections are more of a VM-level property, if I understood the
use case correctly: some gfn ranges are to be marked non-executable by
userspace. Setting RWX within guest_memfd would be kind of awkward since
userspace must first translate GFN to offset, then set it using the
offset within guest_memfd. Hence I think it's okay to have RWX stuff go
through the regular KVM_SET_MEMORY_ATTRIBUTES *VM* ioctl and have it
tracked in mem_attr_array.

I'd prefer not to have the module param choose between the use of
mem_attr_array and guest_memfd conversion in case we need both
mem_attr_array to support other stuff in future while supporting
conversions.

> The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
> union to clarify it's a pgoff instead of an address when used for guest_memfd.
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 52f6000ab020..e0d8255ac8d2 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
>  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
>  
>  struct kvm_memory_attributes {
> -       __u64 address;
> +       union {
> +               __u64 address;
> +               __u64 offset;
> +       };
>         __u64 size;
>         __u64 attributes;
>         __u64 flags;
>

struct kvm_memory_attributes doesn't have room for reporting the offset
at which conversion failed (error_offset in the new struct). How do we
handle this? Do we reuse the flags field, or do we not report
error_offset?

>> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
>> +
>> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>> +{
>> +	return inode->i_mapping->i_private_data;
>
> This is a hilariously bad helper.  Everyone and their mother is going to think
> about "private vs. shared" when they see kvm_gmem_private(), at least on the x86
> side.
>

Totally missed this interpretation of private, lol. Too many
interpretations of private: MAP_PRIVATE, CoCo's private vs shared, and
i_private_data.

> What's even more absurd is that the only "final" usage of the helper is to
> free/destroy the inode:
>
>   $ git grep kvm_gmem_private
>   virt/kvm/guest_memfd.c:static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>   virt/kvm/guest_memfd.c: return kvm_gmem_private(inode)->allocator_ops;
>   virt/kvm/guest_memfd.c: return kvm_gmem_private(inode)->allocator_private;
>   virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
>   virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
>   virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
>   virt/kvm/guest_memfd.c: mt = &kvm_gmem_private(inode)->shareability;
>   virt/kvm/guest_memfd.c: struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>   virt/kvm/guest_memfd.c: struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>
> And in that case, using a wrapper is counter-productive, just reference
> inode->i_mapping->i_private_data directly so that readers don't have to jump
> through a useless layer.
>
> Luckily, "struct kvm_gmem_inode_private" no longer needs to exist, now that
> Shivank's NUMA policy series wraps the vfs_inode with a gmem_inode, and can be
> retrieved via GMEM_I().  FWIW, before looking at that series, I was going to suggest
> something like to_gmem(), but I definitely think we should follow filesystems
> convention, not KVM vCPU/VM convention.
>

I'll go with the wrapper struct to align with filesystems conventions then.

>>   * folio_file_pfn - like folio_file_page, but return a pfn.
>>   * @folio: The folio which contains this index.
>> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>>  }
>>  
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +
>> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
>> +				      loff_t size, u64 flags)
>> +{
>> +	enum shareability m;
>> +	pgoff_t last;
>> +
>> +	last = (size >> PAGE_SHIFT) - 1;
>> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
>> +						    SHAREABILITY_ALL;
>> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
>> +				 GFP_KERNEL);
>> +}
>> +
>> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
>> +						 pgoff_t index)
>> +{
>> +	struct maple_tree *mt;
>> +	void *entry;
>> +
>> +	mt = &kvm_gmem_private(inode)->shareability;
>> +	entry = mtree_load(mt, index);
>> +	WARN(!entry,
>
> WARN_ON_ONCE(), otherwise we risk escalating a per-VM problem into a system-wide
> DoS.
>

Will take note for next time.

>> +	     "Shareability should always be defined for all indices in inode.");
>> +
>> +	return xa_to_value(entry);
>> +}
>> +
>> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> +{
>> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
>> +		return ERR_PTR(-EACCES);
>> +
>> +	return kvm_gmem_get_folio(inode, index);
>
> Please don't add 1-3 line helpers with one caller and very little hope of gaining
> additional users, especially in guest_memfd where "shared" and "private" have
> multiple meanings, and so things like "get_shared_folio" are inherently ambiguous.
>
> I'm pretty sure a lot of this stems from CONFIG_KVM_GMEM_SHARED_MEM, which AFAICT
> simply won't exist.  But just in case this is a Google3 pattern... 
>

Will take note for next time.

>> +}
>> +
>> +#else
>> +
>> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private, loff_t size, u64 flags)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> +{
>> +	WARN_ONCE(1, "Unexpected call to get shared folio.");
>> +	return NULL;
>> +}
>> +
>> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>> +
>>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  				    pgoff_t index, struct folio *folio)
>>  {
>> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>>  
>>  	filemap_invalidate_lock_shared(inode->i_mapping);
>>  
>> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>
> I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
> underlying memory from being converted from shared=>private after checking that
> the page is SHARED.
>

Conversions take the filemap_invalidate_lock() too, along with
allocations, truncations.

Because the filemap_invalidate_lock() might be reused for other
fs-specific operations, I didn't do the mt_set_external_lock() thing to
lock at a low level to avoid nested locking or special maple tree code
to avoid taking the lock on other paths.

> The locking rules for the maple_tree are also undocumented and haphazard.  I think
> we can kill several birds with one stone by protecting the attributes with
> invalidate_lock.  A bonus with using invalidate_lock is that it's a sleepable
> lock, not a spinlock.  I don't think there's anything that would immediately
> benefit?  But if we wanted to populate the tree on-demand (versus pre-filling
> all possible pages), then it'd be easier to handle things like allocations in a
> race free manner.
>
> 	/*
> 	 * Protect the attributes with the invalidation lock, which will always
> 	 * be held on conversions
> 	 */
> 	mt_init_flags(&gi->attributes, MT_FLAGS_LOCK_EXTERN);
> 	mt_set_external_lock(&gi->attributes,
> 			     &inode->i_mapping->invalidate_lock);
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ira Weiny 2 months, 2 weeks ago
Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 

[snip]

> 
> > Internally, that lets us do some fun things in KVM.  E.g. if we make the "disable
> > legacy per-VM memory attributes" a read-only module param, then we can wire up a
> > static_call() for kvm_get_memory_attributes() and then kvm_mem_is_private() will
> > Just Work.
> >
> >   static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> >   {
> > 	return static_call(__kvm_get_memory_attributes)(kvm, gfn);
> >   }
> >
> >   static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> >   {
> > 	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> >   }
> >
> > That might trigger some additional surgery if/when we want to support RWX
> > protections on a per-VM basis _and_ a per-gmem basis, but I suspect such churn
> > would pale in comparison to the overall support needed for RWX protections.
> >
> 
> RWX protections are more of a VM-level property, if I understood the use
> case correctly that some gfn ranges are to be marked non-executable by
> userspace. Setting RWX within guest_memfd would be kind of awkward since
> userspace must first translate GFN to offset, then set it using the
> offset within guest_memfd. Hence I think it's okay to have RWX stuff go
> through the regular KVM_SET_MEMORY_ATTRIBUTES *VM* ioctl and have it
> tracked in mem_attr_array.
> 
> I'd prefer not to have the module param choose between the use of
> mem_attr_array and guest_memfd conversion in case we need both
> mem_attr_array to support other stuff in future while supporting
> conversions.

I'm getting pretty confused on how userspace is going to know which ioctl
to use, VM vs gmem.

I was starting to question if going through the VM ioctl should actually
change the guest_memfd flags (shareability).

In a prototype I'm playing with, shareability has become a bit field,
which I think aligns with the idea of expanding the memory attributes.  But I've
had some issues with the TDX tests in trying to decipher when to call
vm_set_memory_attributes() vs guest_memfd_convert_private().

> 
> > The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
> > union to clarify it's a pgoff instead of an address when used for guest_memfd.
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 52f6000ab020..e0d8255ac8d2 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
> >  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
> >  
> >  struct kvm_memory_attributes {
> > -       __u64 address;
> > +       union {
> > +               __u64 address;
> > +               __u64 offset;
> > +       };
> >         __u64 size;
> >         __u64 attributes;
> >         __u64 flags;
> >
> 
> struct kvm_memory_attributes doesn't have room for reporting the offset
> at which conversion failed (error_offset in the new struct). How do we
> handle this? Do we reuse the flags field, or do we not report
> error_offset?

We could extend this for gmem's version of the struct.

> 
> >> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> >> +
> >> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> >> +{
> >> +	return inode->i_mapping->i_private_data;
> >
> > This is a hilariously bad helper.  Everyone and their mother is going to think
> > about "private vs. shared" when they see kvm_gmem_private(), at least on the x86
> > side.
> >
> 
> Totally missed this interpretation of private, lol. Too many
> interpretations of private: MAP_PRIVATE, CoCo's private vs shared, and
> i_private_data.
> 

FWIW this did not confuse me and it probably should have...  ;-)  I'm fine
with Sean's suggestion though.

> >> +}
> >> +
> >> +#else
> >> +
> >> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private, loff_t size, u64 flags)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> >> +{
> >> +	WARN_ONCE(1, "Unexpected call to get shared folio.");
> >> +	return NULL;
> >> +}
> >> +
> >> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> >> +
> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >>  				    pgoff_t index, struct folio *folio)
> >>  {
> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> >>  
> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
> >>  
> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> >
> > I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
> > underlying memory from being converted from shared=>private after checking that
> > the page is SHARED.
> >
> 
> Conversions take the filemap_invalidate_lock() too, along with
> allocations, truncations.
> 
> Because the filemap_invalidate_lock() might be reused for other
> fs-specific operations, I didn't do the mt_set_external_lock() thing to
> lock at a low level to avoid nested locking or special maple tree code
> to avoid taking the lock on other paths.

I don't think using the filemap_invalidate_lock() is going to work well
here.  I've had some hangs on it in my testing and experiments.  I think
it is better to specifically lock the state tracking itself.  I believe
Michael mentioned this as well in a previous thread.

Ira

[snip]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 2 months, 2 weeks ago
Ira Weiny <ira.weiny@intel.com> writes:

> Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> 
>
> [snip]
>
>> 
>> > Internally, that lets us do some fun things in KVM.  E.g. if we make the "disable
>> > legacy per-VM memory attributes" a read-only module param, then we can wire up a
>> > static_call() for kvm_get_memory_attributes() and then kvm_mem_is_private() will
>> > Just Work.
>> >
>> >   static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
>> >   {
>> > 	return static_call(__kvm_get_memory_attributes)(kvm, gfn);
>> >   }
>> >
>> >   static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
>> >   {
>> > 	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
>> >   }
>> >
>> > That might trigger some additional surgery if/when we want to support RWX
>> > protections on a per-VM basis _and_ a per-gmem basis, but I suspect such churn
>> > would pale in comparison to the overall support needed for RWX protections.
>> >
>> 
>> RWX protections are more of a VM-level property, if I understood the use
>> case correctly that some gfn ranges are to be marked non-executable by
>> userspace. Setting RWX within guest_memfd would be kind of awkward since
>> userspace must first translate GFN to offset, then set it using the
>> offset within guest_memfd. Hence I think it's okay to have RWX stuff go
>> through the regular KVM_SET_MEMORY_ATTRIBUTES *VM* ioctl and have it
>> tracked in mem_attr_array.
>> 
>> I'd prefer not to have the module param choose between the use of
>> mem_attr_array and guest_memfd conversion in case we need both
>> mem_attr_array to support other stuff in future while supporting
>> conversions.
>
> I'm getting pretty confused on how userspace is going to know which ioctl
> to use VM vs gmem.
>

It is confusing, yes!

> I was starting to question if going through the VM ioctl should actually
> change the guest_memfd flags (shareability).
>

At one of the guest_memfd biweeklies, we came to the conclusion that we
should have a per-VM KVM_CAP_DISABLE_LEGACY_PRIVATE_TRACKING, which will
disable the use of just KVM_MEMORY_ATTRIBUTE_PRIVATE for the
KVM_SET_MEMORY_ATTRIBUTES ioctl, and
KVM_CAP_DISABLE_LEGACY_PRIVATE_TRACKING is the only way to enable
conversions for a guest_memfd with mmap() support.

IOW, KVM_CAP_DISABLE_LEGACY_PRIVATE_TRACKING makes userspace choose
either

+ Using guest_memfd for private memory and some other memory
  (e.g. anonymous memory) for shared memory (aka legacy dual backing)
    + And using KVM_SET_MEMORY_ATTRIBUTES VM ioctl for conversions
    + And using mem_attr_array to track shared/private status

+ Using guest_memfd for both private and shared memory (aka single backing)
    + And using the guest_memfd ioctl for conversions
    + And using guest_memfd shareability to track shared/private status
    
Since userspace has to choose one of the above, there's no point in the
VM ioctl affecting shareability.

Sean's suggestion of a module param moves this choice from VM-level to
KVM/host-level.
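
As a userspace sketch of the two modes above (ioctl and struct names on
the guest_memfd side are illustrative, following this series' proposal
rather than final uAPI):

	/* Legacy dual backing: VM ioctl, tracked in kvm->mem_attr_array. */
	struct kvm_memory_attributes vm_attrs = {
		.address    = gpa,
		.size       = size,
		.attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
	};
	ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &vm_attrs);

	/* Single backing: guest_memfd ioctl, tracked as shareability. */
	struct kvm_gmem_convert gmem_args = {
		.offset = offset,
		.size   = size,
	};
	ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &gmem_args);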

> In a prototype I'm playing with shareability has become a bit field which
> I think aligns with the idea of expanding the memory attributes.

I guess this is tangentially related and could do with some profiling,
but we should be careful about adding too many states in the maple tree.

Conversion iterates over offset ranges in the maple tree, and iteration
is faster if there are fewer nodes in the maple tree.

If we just have two states (shared/private) in the maple tree, each node
is either all private or all shared.

If we have more states, private ranges might get fragmented based on the
orthogonal bits, e.g. RWX, which could then impact conversion
performance.
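
As a rough way to see the cost, here's a sketch (not from the series)
that counts the range entries a conversion walk would visit; with a
pure two-state tree a fully-private file is a single entry, while
orthogonal bits like RWX can split it into many:

static unsigned long shareability_nr_ranges(struct maple_tree *mt)
{
	unsigned long index = 0;
	unsigned long nr = 0;
	void *entry;

	/* Each stored range is one entry; conversion walks scale with this. */
	mt_for_each(mt, entry, index, ULONG_MAX)
		nr++;

	return nr;
}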

> But I've
> had some issues with the TDX tests in trying to decipher when to call
> vm_set_memory_attributes() vs guest_memfd_convert_private().
>

Hope the above explanation helps!

+ Legacy dual backing: vm_set_memory_attributes()
+ Single backing: guest_memfd_convert_private()

I don't think this will change much even with the module param, since
userspace will still need to know whether to pass in a vm fd or a
guest_memfd fd.

Or maybe vm_set_memory_attributes() can take a vm fd, then query module
params and then figure out if it should pass vm or guest_memfd fds?

>> 
>> [...snip...]
>> 
>> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>> >>  				    pgoff_t index, struct folio *folio)
>> >>  {
>> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>> >>  
>> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
>> >>  
>> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>> >
>> > I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
>> > underlying memory from being converted from shared=>private after checking that
>> > the page is SHARED.
>> >
>> 
>> Conversions take the filemap_invalidate_lock() too, along with
>> allocations, truncations.
>> 
>> Because the filemap_invalidate_lock() might be reused for other
>> fs-specific operations, I didn't do the mt_set_external_lock() thing to
>> lock at a low level to avoid nested locking or special maple tree code
>> to avoid taking the lock on other paths.
>
> I don't think using the filemap_invalidate_lock() is going to work well
> here.  I've had some hangs on it in my testing and experiments.  I think
> it is better to specifically lock the state tracking itself.  I believe
> Michael mentioned this as well in a previous thread.

Definitely took the big hammer lock for a start and might be optimizable.

Considerations so far: when a conversion is happening, these have to be
locked out:

+ Conversions from competing threads
+ Allocations in kvm_gmem_fault_user_mapping(), because whether an
  offset can be faulted depends on the outcome of conversion
+ Allocations (fallocate() or kvm_gmem_get_pfn()) and truncations,
  because conversions (for now) involve removing a folio from the
  filemap, restructuring, then restoring to the filemap, and
    + Allocations should reuse a folio that was already in the filemap
    + Truncations remove a folio, and should not skip removal of a folio
      because it was taken out just for conversion
+ memory failure handling, where we don't remove folios from the
  filemap, but we might restructure, to split huge folios to just unmap
  pages with failed memory

I think essentially, because conversion involves restructuring,
restructuring involves filemap operations, and other filemap operations
have to wait, conversion also takes the filemap_invalidate_lock() that
filemap operations use.
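
Roughly, with assumed helper names (a sketch, not the actual series
code):

static int kvm_gmem_convert_range(struct inode *inode, pgoff_t start,
				  pgoff_t last, enum shareability m)
{
	struct maple_tree *mt = &kvm_gmem_private(inode)->shareability;
	int r;

	/*
	 * Holding the invalidate lock for write locks out competing
	 * conversions, fault-path allocations, fallocate() and
	 * truncation; fault paths take it shared before checking
	 * shareability, closing the TOCTOU window.
	 */
	filemap_invalidate_lock(inode->i_mapping);

	/* Unmap and restructure folios in [start, last] here (elided). */

	r = mtree_store_range(mt, start, last, xa_mk_value(m), GFP_KERNEL);

	filemap_invalidate_unlock(inode->i_mapping);
	return r;
}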

>
> Ira
>
> [snip]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ira Weiny 2 months, 2 weeks ago
Ackerley Tng wrote:
> Ira Weiny <ira.weiny@intel.com> writes:
> 
> > Ackerley Tng wrote:
> >> Sean Christopherson <seanjc@google.com> writes:
> >> 
> >
> > [snip]
> >

[snip]

> >> 
> >> I'd prefer not to have the module param choose between the use of
> >> mem_attr_array and guest_memfd conversion in case we need both
> >> mem_attr_array to support other stuff in future while supporting
> >> conversions.
> >
> > I'm getting pretty confused on how userspace is going to know which ioctl
> > to use VM vs gmem.
> >
> 
> It is confusing, yes!
> 
> > I was starting to question if going through the VM ioctl should actually
> > change the guest_memfd flags (shareability).
> >
> 
> At one of the guest_memfd biweeklies, we came to the conclusion that we
> should have a per-VM KVM_CAP_DISABLE_LEGACY_PRIVATE_TRACKING, which will
> disable the use of just KVM_MEMORY_ATTRIBUTE_PRIVATE for the
> KVM_SET_MEMORY_ATTRIBUTES ioctl, and
> KVM_CAP_DISABLE_LEGACY_PRIVATE_TRACKING is the only way to enable
> conversions for a guest_memfd with mmap() support.
> 
> IOW, KVM_CAP_DISABLE_LEGACY_PRIVATE_TRACKING makes userspace choose
> either
> 
> + Using guest_memfd for private memory and some other memory
>   (e.g. anonymous memory) for shared memory (aka legacy dual backing)
>     + And using KVM_SET_MEMORY_ATTRIBUTES VM ioctl for conversions
>     + And using mem_attr_array to track shared/private status
> 
> + Using guest_memfd for both private and shared memory (aka single backing)
>     + And using the guest_memfd ioctl for conversions
>     + And using guest_memfd shareability to track shared/private status
>     
> Since userspace has to choose one of the above, there's no point in the
> VM ioctl affecting shareability.
> 
> Sean's suggestion of a module param moves this choice from VM-level to
> KVM/host-level.

Ok I remember this discussion but I was not clear on the mechanics.  This
helps clarify things, thanks!

> 
> > In a prototype I'm playing with shareability has become a bit field which
> > I think aligns with the idea of expanding the memory attributes.
> 
> I guess this is tangentially related and could do with some profiling,
> but we should be careful about adding too many states in the maple tree.
> 
> Conversion iterates over offset ranges in the maple tree, and iteration
> is faster if there are fewer nodes in the maple tree.
> 
> If we just have two states (shared/private) in the maple tree, each node
> is either all private or all shared.
> 
> If we have more states, private ranges might get fragmented based on the
> orthogonal bits, e.g. RWX, which could then impact conversion
> performance.

I'm thinking along these same lines yea.

> 
> > But I've
> > had some issues with the TDX tests in trying to decipher when to call
> > vm_set_memory_attributes() vs guest_memfd_convert_private().
> >
> 
> Hope the above explanation helps!
> 
> + Legacy dual backing: vm_set_memory_attributes()
> + Single backing: guest_memfd_convert_private()
> 
> I don't think this will change much even with the module param, since
> userspace will still need to know whether to pass in a vm fd or a
> guest_memfd fd.
> 
> Or maybe vm_set_memory_attributes() can take a vm fd, then query module
> params and then figure out if it should pass vm or guest_memfd fds?

For the tests that might help for sure.

Generally I'm hesitant to introduce module parameters as they are pretty
big hammers to change behavior.  But if it makes things easier and is
acceptable then I'm not going to complain...

> 
> >> 
> >> [...snip...]
> >> 
> >> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >> >>  				    pgoff_t index, struct folio *folio)
> >> >>  {
> >> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> >> >>  
> >> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
> >> >>  
> >> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> >> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> >> >
> >> > I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
> >> > underlying memory from being converted from shared=>private after checking that
> >> > the page is SHARED.
> >> >
> >> 
> >> Conversions take the filemap_invalidate_lock() too, along with
> >> allocations, truncations.
> >> 
> >> Because the filemap_invalidate_lock() might be reused for other
> >> fs-specific operations, I didn't do the mt_set_external_lock() thing to
> >> lock at a low level to avoid nested locking or special maple tree code
> >> to avoid taking the lock on other paths.
> >
> > I don't think using the filemap_invalidate_lock() is going to work well
> > here.  I've had some hangs on it in my testing and experiments.  I think
> > it is better to specifically lock the state tracking itself.  I believe
> > Michael mentioned this as well in a previous thread.
> 
> Definitely took the big hammer lock for a start and might be optimizable.
> 
> Considerations so far: when a conversion is happening, these have to be
> locked out:
> 
> + Conversions from competing threads

Agreed.  And this needs filemap_invalidate_lock() as well as the maple
tree lock.

Call this item 1.

> + Allocations in kvm_gmem_fault_user_mapping(), because whether an
>   offset can be faulted depends on the outcome of conversion

Agreed.  And this needs filemap_invalidate_lock() as well as the maple
tree lock.

Call this item 2.

> + Allocations (fallocate() or kvm_gmem_get_pfn()) and truncations,
>   because conversions (for now) involve removing a folio from the
>   filemap, restructuring, then restoring to the filemap, and
>     + Allocations should reuse a folio that was already in the filemap
>     + Truncations remove a folio, and should not skip removal of a folio
>       because it was taken out just for conversion

I don't think this is required...

> + memory failure handling, where we don't remove folios from the
>   filemap, but we might restructure, to split huge folios to just unmap
>   pages with failed memory

... nor this.  These don't change the shareability maple tree.

These operations don't change or need to know the shareability AFAICT.

Merging a folio would have to check the maple tree to ensure we don't
merge incompatible folios...  But that is a read check and should be easy
to add.

> I think essentially because conversion involves restructuring, and
> restructuring involves filemap operations and other filemap operations
> have to wait, conversion also takes the filemap_invalidate_lock() that
> filemap operations use.

I could be convinced otherwise, but I'm thinking the overhead of another
lock for the sake of simplicity is a good trade-off.  I don't think any of
the conversions are a fast path operation, are they?
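
Something along these lines, sketch only, with invented names:

struct kvm_gmem_inode_private {
	struct maple_tree shareability;
	struct rw_semaphore shareability_lock; /* nests inside invalidate_lock */
};

static enum shareability kvm_gmem_shareability_get(struct inode *inode,
						   pgoff_t index)
{
	struct kvm_gmem_inode_private *private = inode->i_mapping->i_private_data;
	void *entry;

	/* Readers take only the dedicated lock, not the invalidate lock. */
	down_read(&private->shareability_lock);
	entry = mtree_load(&private->shareability, index);
	up_read(&private->shareability_lock);

	return xa_to_value(entry);
}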

Ira

[snip]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 2 months ago
Ira Weiny <ira.weiny@intel.com> writes:

> 
> [...snip...]
> 
>> >> 
>> >> Conversions take the filemap_invalidate_lock() too, along with
>> >> allocations, truncations.
>> >> 
>> >> Because the filemap_invalidate_lock() might be reused for other
>> >> fs-specific operations, I didn't do the mt_set_external_lock() thing to
>> >> lock at a low level to avoid nested locking or special maple tree code
>> >> to avoid taking the lock on other paths.
>> >
>> > I don't think using the filemap_invalidate_lock() is going to work well
>> > here.  I've had some hangs on it in my testing and experiments.  I think
>> > it is better to specifically lock the state tracking itself.  I believe
>> > Michael mentioned this as well in a previous thread.
>> 
>> Definitely took the big hammer lock for a start and might be optimizable.
>> 
>> Considerations so far: when a conversion is happening, these have to be
>> locked out:
>> 
>> + Conversions from competing threads
>
> Agreed.  And this needs filemap_invalidate_lock() as well as the maple
> tree lock.
>
> Call this item 1.
>
>> + Allocations in kvm_gmem_fault_user_mapping(), because whether an
>>   offset can be faulted depends on the outcome of conversion
>
> Agreed.  And this needs filemap_invalidate_lock() as well as the maple
> tree lock.
>
> Call this item 2.
>
>> + Allocations (fallocate() or kvm_gmem_get_pfn()) and truncations,
>>   because conversions (for now) involve removing a folio from the
>>   filemap, restructuring, then restoring to the filemap, and
>>     + Allocations should reuse a folio that was already in the filemap
>>     + Truncations remove a folio, and should not skip removal of a folio
>>       because it was taken out just for conversion
>
> I don't think this is required...
>
>> + memory failure handling, where we don't remove folios from the
>>   filemap, but we might restructure, to split huge folios to just unmap
>>   pages with failed memory
>
> ... nor this.  These don't change the shareability maple tree.
>
> These operations don't change or need to know the shareability AFAICT.
>
> Merging a folio would have to check the maple tree to ensure we don't
> merge incompatible folios...  But that is a read check and should be easy
> to add.
>
>> I think essentially because conversion involves restructuring, and
>> restructuring involves filemap operations and other filemap operations
>> have to wait, conversion also takes the filemap_invalidate_lock() that
>> filemap operations use.
>
> I could be convinced otherwise, but I'm thinking the overhead of another
> lock for the sake of simplicity is a good trade-off.  I don't think any of
> the conversions are a fast path operation, are they?

Haha, I think not having another lock is simpler! Looks like it's
starting to get subjective.

For the next RFC, I'll go with re-using the filemap_invalidate_lock(),
and the next RFC will have quite some changes too. Please feel free to
bring this up again. The next RFC is an RFC and won't be committal
anyway :)

>
> Ira
>
> [snip]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Sean Christopherson 2 months, 2 weeks ago
On Wed, Oct 01, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> > On Wed, May 14, 2025, Ackerley Tng wrote:
> >> +enum shareability {
> >> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
> >> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
> >> +};
> >
> > Rather than define new values and new KVM uAPI, I think we should instead simply
> > support KVM_SET_MEMORY_ATTRIBUTES.  We'll probably need a new CAP, as I'm not sure
> > supporting KVM_CHECK_EXTENSION+KVM_CAP_MEMORY_ATTRIBUTES on a gmem fd would be a
> > good idea (e.g. trying to do KVM_CAP_GUEST_MEMFD_FLAGS on a gmem fd doesn't work
> > because the whole point is to get flags _before_ creating the gmem instance).  But
> > adding e.g. KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES is easy enough.
> >
> 
> I've read this a few times and I'm a bit confused, so just making sure:
> you are suggesting that we reuse the KVM_SET_MEMORY_ATTRIBUTES ioctl as
> a guest_memfd (not a VM) ioctl and still store private/shared state
> within guest_memfd, right?

Yep.  Something like:

static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
{
	struct gmem_file *f = file->private_data;
	struct inode *inode = file_inode(file);
	struct kvm_memory_attributes attrs;
	pgoff_t err_index;
	int r;

	if (copy_from_user(&attrs, argp, sizeof(attrs)))
		return -EFAULT;

	if (attrs.flags)
		return -EINVAL;
	if (attrs.attributes & ~kvm_gmem_supported_mem_attributes(f))
		return -EINVAL;
	if (attrs.size == 0 || attrs.offset + attrs.size < attrs.offset)
		return -EINVAL;
	if (!PAGE_ALIGNED(attrs.offset) || !PAGE_ALIGNED(attrs.size))
		return -EINVAL;

	if (attrs.offset > inode->i_size ||
	    attrs.offset + attrs.size > inode->i_size)
		return -EINVAL;

	r = __kvm_gmem_set_attributes(inode, &attrs, &err_index);
	if (r) {
		attrs.offset = err_index << PAGE_SHIFT;
		if (copy_to_user(argp, &attrs, sizeof(attrs)))
			return -EFAULT;

		return r;
	}

	return 0;
}

static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg)
{
	switch (ioctl) {
	case KVM_SET_MEMORY_ATTRIBUTES:
		return kvm_gmem_set_attributes(file, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}
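
For completeness, the dispatch would presumably just be wired into
guest_memfd's file_operations, something like (sketch, existing handlers
elided):

static struct file_operations kvm_gmem_fops = {
	/* existing open/release/fallocate/mmap handlers unchanged */
	.unlocked_ioctl	= kvm_gmem_ioctl,
};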

> I think fundamentally the introduction of the guest_memfd ioctls was
> motivated by how private/shared state is a property of memory and not a
> property of the VM. (IIRC you were the one to most succinctly phrase it
> this way on one of the guest_memfd biweeklies.) So I hope you don't mean
> to revert to doing conversions through a VM ioctl.

I do not.  Ah shoot.  I responded to my (much earlier) mail on this to clarify
exactly this point, but I botched the Cc and threading, and it didn't make its
way to you.

https://lore.kernel.org/all/aNxxJodpbHceb3rF@google.com

> > But for specifying PRIVATE vs. SHARED, I don't see any reason to define new uAPI.
> > I also don't want an entirely new set of terms in KVM to describe the same things.
> > PRIVATE and SHARED are far from perfect, but they're better than https://xkcd.com/927.
> > And if we ever want to let userspace restrict RWX protections in gmem, we'll have
> > a ready-made way to do so.  
> >
> 
> Would like to understand more about RWX protections: is the use case to
> let userspace specify that certain ranges of guest memory are to be
> mapped into stage 2 page tables without executable permissions?

Yep.  Or execute-only.  Or read-only.  The primary use case I'm aware of is for
supporting things like VBS (Hyper-V's virtualization based security) and HEKI[1] (which
is effectively the same thing as VBS, and is indeed being dropped in favor of
simply piggybacking the VBS guest<=>host ABI).

VBS allows the kernel to deprivilege itself, and hoist a small amount of code
into a more privileged "thing".  In KVM, the separate privilege domains will be
called "planes"[2].  For RWX protections, the more privileged plane would have
full RWX access to all of guest memory, while the deprivileged kernel will have
select chunks of memory mapped RO (e.g. kernel page tables, GDT, IDT, etc.), or
potentially not at all (see Credential Guard).

I don't know if tracking per-plane RWX state in guest_memfd would be a good idea,
but it costs practically nothing to keep the possibility open.

[1] https://lore.kernel.org/all/20231113022326.24388-1-mic@digikod.net
[2] https://lore.kernel.org/all/20250401161106.790710-1-pbonzini@redhat.com

> Is there a further use case to let the guest specify that userspace must
> not mmap() some ranges as executable?
> 
> For guest_memfd the userspace mapping permissions are already defined
> by userspace and so unless guest_memfd must enforce something on behalf
> of the guest, there shouldn't be anything more that guest_memfd should
> track with respect to RWX permissions.

But not all userspaces are created equal.  E.g. if a VM is sharing memory with
another entity, it might want to restrict that sharing to be read-only.  I don't
know that memory attributes would be the best way to express such rules, just
saying that fully relying on mmap() has limitations.

> > Internally, that lets us do some fun things in KVM.  E.g. if we make the "disable
> > legacy per-VM memory attributes" a read-only module param, then we can wire up a
> > static_call() for kvm_get_memory_attributes() and then kvm_mem_is_private() will
> > Just Work.
> >
> >   static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
> >   {
> > 	return static_call(__kvm_get_memory_attributes)(kvm, gfn);
> >   }
> >
> >   static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> >   {
> > 	return kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> >   }
> >
> > That might trigger some additional surgery if/when we want to support RWX
> > protections on a per-VM basis _and_ a per-gmem basis, but I suspect such churn
> > would pale in comparison to the overall support needed for RWX protections.
> >
> 
> RWX protections are more of a VM-level property, if I understood the use
> case correctly that some gfn ranges are to be marked non-executable by
> userspace. Setting RWX within guest_memfd would be kind of awkward since
> userspace must first translate GFN to offset, then set it using the
> offset within guest_memfd. Hence I think it's okay to have RWX stuff go
> through the regular KVM_SET_MEMORY_ATTRIBUTES *VM* ioctl and have it
> tracked in mem_attr_array.

Maybe.  It will depend on how the use cases shake out.  E.g. before the planes
idea came along, the proposal for supporting different privilege levels was to
represent each privilege level with its own "struct kvm", at which point tracking
RWX protections per-VM (struct kvm) made sense.

But with planes, that's no longer true.  E.g. we'd need RWX flags for plane0
and separate RWX flags for plane1 regardless of whether they're tracked in
struct kvm or in the gmem instance.

To be clear, I don't have an opinion one way or the other, because what we'll
end up with is quite unclear.  All I was calling out is that reusing
KVM_SET_MEMORY_ATTRIBUTES provides a lot of the plumbing, _if_ we want to define
RWX protections on a gmem instance.

> I'd prefer not to have the module param choose between the use of
> mem_attr_array and guest_memfd conversion in case we need both
> mem_attr_array to support other stuff in future while supporting
> conversions.

Luckily, we don't actually need to make a decision on this, because PRIVATE is
the only attribute that exists.  Which is partly why I want to go with a module
param.  We can make the behavior very definitive without significant risk of
causing ABI hell.

It's entirely possible I'm completely wrong and we'll end up with per-VM RWX
protections and no other per-gmem memory attributes, but as above, unwinding or
adjusting the module param will be a drop in the bucket compared to the effort
needed to add whatever support comes along.

> > The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
> > union to clarify it's a pgoff instead of an address when used for guest_memfd.
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 52f6000ab020..e0d8255ac8d2 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
> >  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
> >  
> >  struct kvm_memory_attributes {
> > -       __u64 address;
> > +       union {
> > +               __u64 address;
> > +               __u64 offset;
> > +       };
> >         __u64 size;
> >         __u64 attributes;
> >         __u64 flags;
> >
> 
> struct kvm_memory_attributes doesn't have room for reporting the offset
> at which conversion failed (error_offset in the new struct). How do we
> handle this? Do we reuse the flags field, or do we not report
> error_offset?

Write back at address/offset (and update size too, which I probably forgot to do).
Ugh, but it's defined _IOW.  I forget if that matters in practice (IIRC, it's not
enforced anywhere, i.e. purely informational for userspace).

> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >>  				    pgoff_t index, struct folio *folio)
> >>  {
> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> >>  
> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
> >>  
> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> >
> > I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
> > underlying memory from being converted from shared=>private after checking that
> > the page is SHARED.
> >
> 
> Conversions take the filemap_invalidate_lock() too, along with
> allocations, truncations.
> 
> Because the filemap_invalidate_lock() might be reused for other
> fs-specific operations, I didn't do the mt_set_external_lock() thing to
> lock at a low level to avoid nested locking or special maple tree code
> to avoid taking the lock on other paths.

mt_set_external_lock() is a nop.  It exists purely for lockdep assertions.  Per
the comment for MT_FLAGS_LOCK_EXTERN, "mt_lock is not used", LOCK_EXTERN simply
tells maple tree to not use/take mt_lock.   I.e. it doesn't say "take this lock
instead", it says "I'll handle locking".
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 2 months, 2 weeks ago
Sean Christopherson <seanjc@google.com> writes:

> On Wed, Oct 01, 2025, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> > On Wed, May 14, 2025, Ackerley Tng wrote:
>> >> +enum shareability {
>> >> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
>> >> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
>> >> +};
>> >
>> > Rather than define new values and new KVM uAPI, I think we should instead simply
>> > support KVM_SET_MEMORY_ATTRIBUTES.  We'll probably need a new CAP, as I'm not sure
>> > supporting KVM_CHECK_EXTENSION+KVM_CAP_MEMORY_ATTRIBUTES on a gmem fd would be a
>> > good idea (e.g. trying to do KVM_CAP_GUEST_MEMFD_FLAGS on a gmem fd doesn't work
>> > because the whole point is to get flags _before_ creating the gmem instance).  But
>> > adding e.g. KVM_CAP_GUEST_MEMFD_MEMORY_ATTRIBUTES is easy enough.
>> >
>> 
>> I've read this a few times and I'm a bit confused, so just making sure:
>> you are suggesting that we reuse the KVM_SET_MEMORY_ATTRIBUTES ioctl as
>> a guest_memfd (not a VM) ioctl and still store private/shared state
>> within guest_memfd, right?
>
> Yep.  Something like:
>
> static long kvm_gmem_set_attributes(struct file *file, void __user *argp)
> {
> 	struct gmem_file *f = file->private_data;
> 	struct inode *inode = file_inode(file);
> 	struct kvm_memory_attributes attrs;
> 	pgoff_t err_index;
> 	int r;
>
> 	if (copy_from_user(&attrs, argp, sizeof(attrs)))
> 		return -EFAULT;
>
> 	if (attrs.flags)
> 		return -EINVAL;
> 	if (attrs.attributes & ~kvm_gmem_supported_mem_attributes(f))
> 		return -EINVAL;
> 	if (attrs.size == 0 || attrs.offset + attrs.size < attrs.offset)
> 		return -EINVAL;
> 	if (!PAGE_ALIGNED(attrs.offset) || !PAGE_ALIGNED(attrs.size))
> 		return -EINVAL;
>
> 	if (attrs.offset > inode->i_size ||
> 	    attrs.offset + attrs.size > inode->i_size)
> 		return -EINVAL;
>
> 	r = __kvm_gmem_set_attributes(inode, &attrs, &err_index);
> 	if (r) {
> 		attrs.offset = err_index << PAGE_SHIFT;
> 		if (copy_to_user(argp, &attrs, sizeof(attrs)))
> 			return -EFAULT;
>
> 		return r;
> 	}
>
> 	return 0;
> }
>
> static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
> 			   unsigned long arg)
> {
> 	switch (ioctl) {
> 	case KVM_SET_MEMORY_ATTRIBUTES:
> 		return kvm_gmem_set_attributes(file, (void __user *)arg);
> 	default:
> 		return -ENOTTY;
> 	}
> }
>
>> I think fundamentally the introduction of the guest_memfd ioctls was
>> motivated by how private/shared state is a property of memory and not a
>> property of the VM. (IIRC you were the one to most succinctly phrase it
>> this way on one of the guest_memfd biweeklies.) So I hope you don't mean
>> to revert to doing conversions through a VM ioctl.
>
> I do not.  Ah shoot.  I responded to my (much earlier) mail on this to clarify
> exactly this point, but I botched the Cc and threading, and it didn't make it's
> way to you.
>
> https://lore.kernel.org/all/aNxxJodpbHceb3rF@google.com
>

Phew. Good to have this clarified.

>> 
>> [...snip...]
>> 
>> I'd prefer not to have the module param choose between the use of
>> mem_attr_array and guest_memfd conversion in case we need both
>> mem_attr_array to support other stuff in future while supporting
>> conversions.
>
> Luckily, we don't actually need to make a decision on this, because PRIVATE is
> the only attribute that exists.  Which is partly why I want to go with a module
> param.  We can make the behavior very definitive without significant risk of
> causing ABI hell.
>

Then maybe I'm misunderstanding the static_call() thing you were
describing. Is it like, at KVM module initialization time,

    if module_param == disable_tracking:
        .__kvm_get_memory_attributes = read_attributes_from_guest_memfd
    else
        .__kvm_get_memory_attributes = read_attributes_from_mem_attr_array

With that, I can't have both CoCo private/shared state tracked in
guest_memfd and RWX (as an example, could be any future attribute)
tracked in mem_attr_array on the same VM.
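
In C, I'm picturing something like the following (sketch; the module
param name is invented, the function names are from my pseudocode above):

    DEFINE_STATIC_CALL(__kvm_get_memory_attributes,
                       read_attributes_from_mem_attr_array);

    static void kvm_choose_attributes_backend(void)
    {
            if (disable_per_vm_attributes)  /* hypothetical module param */
                    static_call_update(__kvm_get_memory_attributes,
                                       read_attributes_from_guest_memfd);
    }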
    
> It's entirely possible I'm completely wrong and we'll end up with per-VM RWX
> protections and no other per-gmem memory attributes, but as above, unwinding or
> adjusting the module param will be a drop in the bucket compared to the effort
> needed to add whatever support comes along.
>

Is a module param a weaker userspace contract such that the definition
for module params can be more flexibly adjusted?

>> > The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
>> > union to clarify it's a pgoff instead of an address when used for guest_memfd.
>> >
>> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> > index 52f6000ab020..e0d8255ac8d2 100644
>> > --- a/include/uapi/linux/kvm.h
>> > +++ b/include/uapi/linux/kvm.h
>> > @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
>> >  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
>> >  
>> >  struct kvm_memory_attributes {
>> > -       __u64 address;
>> > +       union {
>> > +               __u64 address;
>> > +               __u64 offset;
>> > +       };
>> >         __u64 size;
>> >         __u64 attributes;
>> >         __u64 flags;
>> >
>> 
>> struct kvm_memory_attributes doesn't have room for reporting the offset
>> at which conversion failed (error_offset in the new struct). How do we
>> handle this? Do we reuse the flags field, or do we not report
>> error_offset?
>
> Write back at address/offset

I think it might be surprising to the userspace program when it checks
the offset that it had requested and finds that it changed due to an
error, or when, upon decoding the error, it is unable to find the
original offset it had requested. Like,

    printf("Error during conversion from offset=%lx with size=%lx, at
           error_offset=%lx", attr.offset, attr.size, attr.error_offset)

would be nicer than 

    original_offset = attr.offset
    printf("Error during conversion from offset=%lx with size=%lx, at
           error_offset=%lx", original_offset, attr.size, attr.error_offset)
           
> (and update size too, which I probably forgot to do).

Why does size need to be updated? I think u64 for size is great, and
size is better than nr_pages since nr_pages differs on different
platforms based on PAGE_SIZE and also nr_pages introduces the question
of "was it hugetlb, or a native page size?".

> Ugh, but it's defined _IOW.  I forget if that matters in practice (IIRC, it's not
> enforced anywhere, i.e. purely informational for userspace).
>

I didn't notice this IOW vs IOWR part, but if it starts getting
enforced/specified [1] or auto-documented we'd be in trouble.

At this point, maybe it's better to just have a different ioctl number
and struct definition. I feel that it would be easier for a user to
associate/separate

+ KVM_SET_MEMORY_ATTRIBUTES
    + Is VM ioctl
    + Is a write-only ioctl
    + Is for setting memory attributes at a VM level
    + Use struct kvm_memory_attributes for this
+ KVM_GUEST_MEMFD_SET_MEMORY_ATTRIBUTES (name TBD)
    + Is guest_memfd ioctl
    + Is a read/write ioctl
    + Is for setting memory attributes only for this guest_memfd
    + Use struct guest_memfd_memory_attributes for this
    + Also decode errors from this struct

[1] https://lore.kernel.org/all/20250825181434.3340805-1-sashal@kernel.org/

>> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>> >>  				    pgoff_t index, struct folio *folio)
>> >>  {
>> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>> >>  
>> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
>> >>  
>> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>> >
>> > I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
>> > underlying memory from being converted from shared=>private after checking that
>> > the page is SHARED.
>> >
>> 
>> Conversions take the filemap_invalidate_lock() too, along with
>> allocations, truncations.
>> 
>> Because the filemap_invalidate_lock() might be reused for other
>> fs-specific operations, I didn't do the mt_set_external_lock() thing to
>> lock at a low level to avoid nested locking or special maple tree code
>> to avoid taking the lock on other paths.
>
> mt_set_external_lock() is a nop.  It exists purely for lockdep assertions.  Per
> the comment for MT_FLAGS_LOCK_EXTERN, "mt_lock is not used", LOCK_EXTERN simply
> tells maple tree to not use/take mt_lock.   I.e. it doesn't say "take this lock
> instead", it says "I'll handle locking".

Thanks for pointing this out!

Conversions (and others) taking the filemap_invalidate_lock() probably
fixes the TOCTOU bug, right?
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Sean Christopherson 2 months, 2 weeks ago
On Wed, Oct 01, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> >> I'd prefer not to have the module param choose between the use of
> >> mem_attr_array and guest_memfd conversion in case we need both
> >> mem_attr_array to support other stuff in future while supporting
> >> conversions.
> >
> > Luckily, we don't actually need to make a decision on this, because PRIVATE is
> > the only attribute that exists.  Which is partly why I want to go with a module
> > param.  We can make the behavior very definitive without significant risk of
> > causing ABI hell.
> >
> 
> Then maybe I'm misunderstanding the static_call() thing you were
> describing. Is it like, at KVM module initialization time,
> 
>     if module_param == disable_tracking:
>         .__kvm_get_memory_attributes = read_attributes_from_guest_memfd
>     else
>         .__kvm_get_memory_attributes = read_attributes_from_mem_attr_array
> 
> With that, I can't have both CoCo private/shared state tracked in
> guest_memfd and RWX (as an example, could be any future attribute)
> tracked in mem_attr_array on the same VM.

More or less.

> > It's entirely possible I'm completely wrong and we'll end up with per-VM RWX
> > protections and no other per-gmem memory attributes, but as above, unwinding or
> > adjusting the module param will be a drop in the bucket compared to the effort
> > needed to add whatever support comes along.
> >
> 
> Is a module param a weaker userspace contract such that the definition
> for module params can be more flexibly adjusted?

Yes, much weaker.

> >> > The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
> >> > union to clarify it's a pgoff instead of an address when used for guest_memfd.
> >> >
> >> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >> > index 52f6000ab020..e0d8255ac8d2 100644
> >> > --- a/include/uapi/linux/kvm.h
> >> > +++ b/include/uapi/linux/kvm.h
> >> > @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
> >> >  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
> >> >  
> >> >  struct kvm_memory_attributes {
> >> > -       __u64 address;
> >> > +       union {
> >> > +               __u64 address;
> >> > +               __u64 offset;
> >> > +       };
> >> >         __u64 size;
> >> >         __u64 attributes;
> >> >         __u64 flags;
> >> >
> >> 
> >> struct kvm_memory_attributes doesn't have room for reporting the offset
> >> at which conversion failed (error_offset in the new struct). How do we
> >> handle this? Do we reuse the flags field, or do we not report
> >> error_offset?
> >
> > Write back at address/offset
> 
> I think it might be surprising to the userspace program when it checks
> the offset that it had requested and finds that it changed due to an
> error, or when, upon decoding the error, it is unable to find the
> original offset it had requested.

It's a somewhat common pattern in the kernel.  Updating the offset+size is most
often used with -EAGAIN to say "got this far, try the syscall again from this
point".

> Like,
> 
>     printf("Error during conversion from offset=%lx with size=%lx, at
>            error_offset=%lx", attr.offset, attr.size, attr.error_offset)
> 
> would be nicer than 
> 
>     original_offset = attr.offset
>     printf("Error during conversion from offset=%lx with size=%lx, at
>            error_offset=%lx", original_offset, attr.size, attr.error_offset)
>            
> > (and update size too, which I probably forgot to do).
> 
> Why does size need to be updated? I think u64 for size is great, and
> size is better than nr_pages since nr_pages differs on different
> platforms based on PAGE_SIZE and also nr_pages introduces the question
> of "was it hugetlb, or a native page size?".

I meant update the number of bytes remaining when updating the offset so that
userspace can redo the ioctl without having to update parameters.

> > Ugh, but it's defined _IOW.  I forget if that matters in practice (IIRC, it's not
> > enforced anywhere, i.e. purely informational for userspace).
> >
> 
> I didn't notice this IOW vs IOWR part, but if it starts getting
> enforced/specified [1] or auto-documented we'd be in trouble.

IOW vs IOWR is already specified in the ioctl.  More below.

> At this point, maybe it's better to just have a different ioctl number
> and struct definition. I feel that it would be easier for a user to
> associate/separate

Amusingly, we'd only need a different name along with the IOWR thing.  A full
ioctl number is comprised of the "directory" (KVM), the number, the size of the
payload, and how the payload is accessed.

#define _IOC(dir,type,nr,size) \
	(((dir)  << _IOC_DIRSHIFT) | \
	 ((type) << _IOC_TYPESHIFT) | \
	 ((nr)   << _IOC_NRSHIFT) | \
	 ((size) << _IOC_SIZESHIFT))

So this:

  #define KVM_SET_MEMORY_ATTRIBUTES	_IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
  #define KVM_SET_MEMORY_ATTRIBUTES2	_IOWR(KVMIO, 0xd2, struct kvm_memory_attributes2)

actually generates two different values, and so is two different ioctls from a
code perspective.

The "size" of the payload is nice to have as it allows userspace to assert that
it's passing the right structure, e.g. this static assert from KVM selftests:

#define kvm_do_ioctl(fd, cmd, arg)						\
({										\
	kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd));	\
	ioctl(fd, cmd, arg);							\
})

> + KVM_SET_MEMORY_ATTRIBUTES
>     + Is VM ioctl
>     + Is a write-only ioctl
>     + Is for setting memory attributes at a VM level
>     + Use struct kvm_memory_attributes for this
> + KVM_GUEST_MEMFD_SET_MEMORY_ATTRIBUTES (name TBD)
>     + Is guest_memfd ioctl
>     + Is a read/write ioctl
>     + Is for setting memory attributes only for this guest_memfd
>     + Use struct guest_memfd_memory_attributes for this
>     + Also decode errors from this struct

      + Has extra padding for future expansion (because why not)

If we really truly need a new ioctl, I'd probably prefer KVM_SET_MEMORY_ATTRIBUTES2.
Yeah, it's silly, but I don't think baking GUEST_MEMFD into the names buys us
anything.  Then we can use KVM_SET_MEMORY_ATTRIBUTES2 on a VM if the need ever
arises.

Alternative #1 is to try and unwind on failure, but that gets complex, and it
simply can't be done for some CoCo VMs.  E.g. a private=>shared conversion for
TDX is destructive.

Alternative #2 is to make the updates atomic and all-or-nothing, which is what
we did for per-VM attributes.  That's doable, but it'd either be much more
complex than telling userspace to retry, or we'd have to lose the maple tree
optimizations (which is effectively what we did for per-VM attributes).

> [1] https://lore.kernel.org/all/20250825181434.3340805-1-sashal@kernel.org/
> 
> >> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >> >>  				    pgoff_t index, struct folio *folio)
> >> >>  {
> >> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> >> >>  
> >> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
> >> >>  
> >> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> >> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> >> >
> >> > I am fairly certain there's a TOCTOU bug here.  AFAICT, nothing prevents the
> >> > underlying memory from being converted from shared=>private after checking that
> >> > the page is SHARED.
> >> >
> >> 
> >> Conversions take the filemap_invalidate_lock() too, along with
> >> allocations, truncations.
> >> 
> >> Because the filemap_invalidate_lock() might be reused for other
> >> fs-specific operations, I didn't do the mt_set_external_lock() thing to
> >> lock at a low level to avoid nested locking or special maple tree code
> >> to avoid taking the lock on other paths.
> >
> > mt_set_external_lock() is a nop.  It exists purely for lockdep assertions.  Per
> > the comment for MT_FLAGS_LOCK_EXTERN, "mt_lock is not used", LOCK_EXTERN simply
> > tells maple tree to not use/take mt_lock.   I.e. it doesn't say "take this lock
> > instead", it says "I'll handle locking".
> 
> Thanks for pointing this out!
> 
> Conversions (and others) taking the filemap_invalidate_lock() probably
> fixes the TOCTOU bug, right?

Yes, grabbing a reference to the folio under lock and thus elevating its refcount
should prevent conversions to private from that point forward, until the PTE is
zapped and the folio is released:

	filemap_invalidate_lock_shared(inode->i_mapping);
	if (kvm_gmem_is_shared_mem(inode, vmf->pgoff))
		folio = kvm_gmem_get_folio(inode, vmf->pgoff);
	else
		folio = ERR_PTR(-EACCES);
	filemap_invalidate_unlock_shared(inode->i_mapping);
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 2 months, 2 weeks ago
Sean Christopherson <seanjc@google.com> writes:

> On Wed, Oct 01, 2025, Ackerley Tng wrote:
>> Sean Christopherson <seanjc@google.com> writes:
>> >> I'd prefer not to have the module param choose between the use of
>> >> mem_attr_array and guest_memfd conversion in case we need both
>> >> mem_attr_array to support other stuff in future while supporting
>> >> conversions.
>> >
>> > Luckily, we don't actually need to make a decision on this, because PRIVATE is
>> > the only attribute that exists.  Which is partly why I want to go with a module
>> > param.  We can make the behavior very definitive without significant risk of
>> > causing ABI hell.
>> >
>> 
>> Then maybe I'm misunderstanding the static_call() thing you were
>> describing. Is it like, at KVM module initialization time,
>> 
>>     if module_param == disable_tracking:
>>         .__kvm_get_memory_attributes = read_attributes_from_guest_memfd
>>     else
>>         .__kvm_get_memory_attributes = read_attributes_from_mem_attr_array
>> 
>> With that, I can't have both CoCo private/shared state tracked in
>> guest_memfd and RWX (as an example, could be any future attribute)
>> tracked in mem_attr_array on the same VM.
>
> More or less.
>

Hm okay. So introducing the module param will only allow the use of one
of the following?

+ KVM_SET_MEMORY_ATTRIBUTES (vm ioctl)
+ KVM_SET_MEMORY_ATTRIBUTES2 (guest_memfd ioctl)

Then I guess using a module param which is a weaker userspace contract
allows us to later enable both vm and guest_memfd ioctl if the need
arises?

>> > It's entirely possible I'm completely wrong and we'll end up with per-VM RWX
>> > protections and no other per-gmem memory attributes, but as above, unwinding or
>> > adjusting the module param will be a drop in the bucket compared to the effort
>> > needed to add whatever support comes along.
>> >
>> 
>> Is a module param a weaker userspace contract such that the definition
>> for module params can be more flexibly adjusted?
>
> Yes, much weaker.
>

I have a new tool in my toolbox now :)

>> >> > The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
>> >> > union to clarify it's a pgoff instead of an address when used for guest_memfd.
>> >> >
>> >> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> >> > index 52f6000ab020..e0d8255ac8d2 100644
>> >> > --- a/include/uapi/linux/kvm.h
>> >> > +++ b/include/uapi/linux/kvm.h
>> >> > @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
>> >> >  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
>> >> >  
>> >> >  struct kvm_memory_attributes {
>> >> > -       __u64 address;
>> >> > +       union {
>> >> > +               __u64 address;
>> >> > +               __u64 offset;
>> >> > +       };
>> >> >         __u64 size;
>> >> >         __u64 attributes;
>> >> >         __u64 flags;
>> >> >
>> >> 
>> >> struct kvm_memory_attributes doesn't have room for reporting the offset
>> >> at which conversion failed (error_offset in the new struct). How do we
>> >> handle this? Do we reuse the flags field, or do we not report
>> >> error_offset?
>> >
>> > Write back at address/offset
>> 
>> I think it might be surprising to the userspace program when it checks
>> the offset that it had requested and finds that it changed due to an
>> error, or when, upon decoding the error, it is unable to find the
>> original offset it had requested.
>
> It's a somewhat common pattern in the kernel.  Updating the offset+size is most
> often used with -EAGAIN to say "got this far, try the syscall again from this
> point".
>

TIL, thanks!

>> Like,
>> 
>>     printf("Error during conversion from offset=%lx with size=%lx, at
>>            error_offset=%lx", attr.offset, attr.size, attr.error_offset)
>> 
>> would be nicer than 
>> 
>>     original_offset = attr.offset
>>     printf("Error during conversion from offset=%lx with size=%lx, at
>>            error_offset=%lx", original_offset, attr.size, attr.error_offset)
>>            
>> > (and update size too, which I probably forgot to do).
>> 
>> Why does size need to be updated? I think u64 for size is great, and
>> size is better than nr_pages since nr_pages differs on different
>> platforms based on PAGE_SIZE and also nr_pages introduces the question
>> of "was it hugetlb, or a native page size?".
>
> I meant update the number of bytes remaining when updating the offset so that
> userspace can redo the ioctl without having to update parameters.
>
>> > Ugh, but it's defined _IOW.  I forget if that matters in practice (IIRC, it's not
>> > enforced anywhere, i.e. purely informational for userspace).
>> >
>> 
>> I didn't notice this IOW vs IOWR part, but if it starts getting
>> enforced/specified [1] or auto-documented we'd be in trouble.
>
> IOW vs IOWR is already specified in the ioctl.  More below.
>
>> At this point, maybe it's better to just have a different ioctl number
>> and struct definition. I feel that it would be easier for a user to
>> associate/separate
>
> Amusingly, we'd only need a different name along with the IOWR thing.  A full
> ioctl number is comprised of the "directory" (KVM), the number, the size of the
> payload, and how the payload is accessed.
>
> #define _IOC(dir,type,nr,size) \
> 	(((dir)  << _IOC_DIRSHIFT) | \
> 	 ((type) << _IOC_TYPESHIFT) | \
> 	 ((nr)   << _IOC_NRSHIFT) | \
> 	 ((size) << _IOC_SIZESHIFT))
>
> So this:
>
>   #define KVM_SET_MEMORY_ATTRIBUTES	_IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
>   #define KVM_SET_MEMORY_ATTRIBUTES2	_IOWR(KVMIO, 0xd2, struct kvm_memory_attributes2)
>
> actually generates two different values, and so is two different ioctls from a
> code perspective.
>
> The "size" of the payload is nice to have as it allows userspace to assert that
> it's passing the right structure, e.g. this static assert from KVM selftests:
>
> #define kvm_do_ioctl(fd, cmd, arg)						\
> ({										\
> 	kvm_static_assert(!_IOC_SIZE(cmd) || sizeof(*arg) == _IOC_SIZE(cmd));	\
> 	ioctl(fd, cmd, arg);							\
> })
>
>> + KVM_SET_MEMORY_ATTRIBUTES
>>     + Is VM ioctl
>>     + Is a write-only ioctl
>>     + Is for setting memory attributes at a VM level
>>     + Use struct kvm_memory_attributes for this
>> + KVM_GUEST_MEMFD_SET_MEMORY_ATTRIBUTES (name TBD)
>>     + Is guest_memfd ioctl
>>     + Is a read/write ioctl
>>     + Is for setting memory attributes only for this guest_memfd
>>     + Use struct guest_memfd_memory_attributes for this
>>     + Also decode errors from this struct
>
>       + Has extra padding for future expansion (because why not)
>
> If we really truly need a new ioctl, I'd probably prefer KVM_SET_MEMORY_ATTRIBUTES2.
> Yeah, it's silly, but I don't think baking GUEST_MEMFD into the names buys us
> anything.  Then we can use KVM_SET_MEMORY_ATTRIBUTES2 on a VM if the need ever
> arises.
>

I'm for having a new ioctl number and new struct; which are you leaning
towards?

As for the naming, I think it's confusing to have something similar, and
Ira mentioned it being confusing in the other email too. At the same
time, I accept that it's useful if the same struct were to be used for a
new iteration of the KVM_SET_MEMORY_ATTRIBUTES VM ioctl in future. No
strong preference either way on naming.


Trying to understand the difference between unwind on failure vs
all-or-nothing:

> Alternative #1 is to try and unwind on failure, but that gets complex, and it
> simply can't be done for some CoCo VMs.  E.g. a private=>shared conversion for
> TDX is destructive.
>

Unwind on failure is:

1. Store current state
2. Convert
3. Restore current state on conversion failure

> Alternative #2 is to make the updates atomic and all-or-nothing, which is what
> we did for per-VM attributes.  That's doable, but it'd either be much more
> complex than telling userspace to retry, or we'd have to lose the maple tree
> optimizations (which is effectively what we did for per-VM attributes).
>

All-or-nothing:

1. Do everything to make sure conversion doesn't fail, bail early if it
   fails
2. Convert (always successful)

Is that it?


Private pages zapped from the stage 2 page tables for TDX can't be
recovered without help from the guest (I think that's what you're
talking about too), although technically I think this zapping step could
be delayed right till the end.

Maple tree allocations for conversion could fail, and allocations are a
bit more complicated since we try to compact ranges with the same
shared/private status into one same maple tree node. Still technically
possible, maybe by updating a copy of the maple tree first, then
swapping the current maple tree out atomically.

With HugeTLB, undoing HVO needs pages to be allocated; I'll need to dig
into the details more to determine if preallocation could work.

I'd still prefer having the option to return an error so that we don't
paint ourselves into a corner.

>> [1] https://lore.kernel.org/all/20250825181434.3340805-1-sashal@kernel.org/
>> 
>> 
>> [...snip...]
>>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 2 months ago
Ackerley Tng <ackerleytng@google.com> writes:

> 
> [...snip...]
> 
>>> >> > The kvm_memory_attributes structure is compatible, all that's needed AFAICT is a
>>> >> > union to clarify it's a pgoff instead of an address when used for guest_memfd.
>>> >> >
>>> >> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>>> >> > index 52f6000ab020..e0d8255ac8d2 100644
>>> >> > --- a/include/uapi/linux/kvm.h
>>> >> > +++ b/include/uapi/linux/kvm.h
>>> >> > @@ -1590,7 +1590,10 @@ struct kvm_stats_desc {
>>> >> >  #define KVM_SET_MEMORY_ATTRIBUTES              _IOW(KVMIO,  0xd2, struct kvm_memory_attributes)
>>> >> >  
>>> >> >  struct kvm_memory_attributes {
>>> >> > -       __u64 address;
>>> >> > +       union {
>>> >> > +               __u64 address;
>>> >> > +               __u64 offset;
>>> >> > +       };
>>> >> >         __u64 size;
>>> >> >         __u64 attributes;
>>> >> >         __u64 flags;
>>> >> >
>>> >> 
>>> >> struct kvm_memory_attributes doesn't have room for reporting the offset
>>> >> at which conversion failed (error_offset in the new struct). How do we
>>> >> handle this? Do we reuse the flags field, or do we not report
>>> >> error_offset?
>>> >
>>> > Write back at address/offset
>>> 
>>> I think it might be surprising to the userspace program when it checks
>>> the offset that it had requested and finds that it changed due to an
>>> error, or when, upon decoding the error, it is unable to find the
>>> original offset it had requested.
>>
>> It's a somewhat common pattern in the kernel.  Updating the offset+size is most
>> often used with -EAGAIN to say "got this far, try the syscall again from this
>> point".
>>
>
> TIL, thanks!
>
>>> Like,
>>> 
>>>     printf("Error during conversion from offset=%lx with size=%lx, at
>>>            error_offset=%lx", attr.offset, attr.size, attr.error_offset)
>>> 
>>> would be nicer than 
>>> 
>>>     original_offset = attr.offset
>>>     printf("Error during conversion from offset=%lx with size=%lx, at
>>>            error_offset=%lx", original_offset, attr.size, attr.error_offset)
>>>            
>>> > (and update size too, which I probably forgot to do).
>>> 
>>> Why does size need to be updated? I think u64 for size is great, and
>>> size is better than nr_pages since nr_pages differs on different
>>> platforms based on PAGE_SIZE and also nr_pages introduces the question
>>> of "was it hugetlb, or a native page size?".
>>
>> I meant update the number of bytes remaining when updating the offset so that
>> userspace can redo the ioctl without having to update parameters.
>>

I was working through this again, and I think the attr.offset returned
from the conversion ioctl is not the same as in other syscalls, where an
updated offset+size indicates "got this far, try the syscall again from
this point".

For the conversion ioctl, -EAGAIN indicates that some unexpected
refcount was first found at offset error_offset, but does not imply that
everything up till error_offset had been converted.

This arises when we start to have hugepage support. To restructure
hugepage-by-hugepage, we will iterate hugepage-wise and check for
elevated refcounts.

Suppose we're converting 10 1G pages and on the 3rd hugepage, the 5th
offset has an elevated refcount.

error_offset should be set to the 5th offset in the 3rd hugepage, but
userspace should retry beginning at the offset of the 3rd hugepage with
size 8G.

If the offset returned to userspace is the 3rd hugepage, then we lose
precision. The refcount at the 3rd hugepage could be fine and expected -
it is the page at the 5th offset in the 3rd hugepage that is pinned and
that userspace should unpin.

So perhaps the interface needs to be defined as

If the error is -EAGAIN:
   + offset: the offset to retry from
   + size: the remaining size to retry
   + error_offset: the offset where an unexpected refcount was found
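
In userspace, that contract would look something like this (sketch; the
struct name and the unpin helper are placeholders):

    struct gmem_memory_attributes a = {
            .offset = start, .size = len,
            .attributes = KVM_MEMORY_ATTRIBUTE_PRIVATE,
    };

    while (ioctl(gmem_fd, KVM_SET_MEMORY_ATTRIBUTES2, &a) == -1) {
            if (errno != EAGAIN)
                    err(1, "conversion failed");
            /* Drop whatever pins a.error_offset, then retry from the
             * written-back a.offset for the remaining a.size. */
            unpin_page_at(a.error_offset);
    }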

>>> 
>>> [...snip...]
>>>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Sean Christopherson 2 months, 2 weeks ago
On Thu, Oct 02, 2025, Ackerley Tng wrote:
> Sean Christopherson <seanjc@google.com> writes:
> 
> > On Wed, Oct 01, 2025, Ackerley Tng wrote:
> >> Sean Christopherson <seanjc@google.com> writes:
> >> >> I'd prefer not to have the module param choose between the use of
> >> >> mem_attr_array and guest_memfd conversion in case we need both
> >> >> mem_attr_array to support other stuff in future while supporting
> >> >> conversions.
> >> >
> >> > Luckily, we don't actually need to make a decision on this, because PRIVATE is
> >> > the only attribute that exists.  Which is partly why I want to go with a module
> >> > param.  We can make the behavior very definitive without significant risk of
> >> > causing ABI hell.
> >> >
> >> 
> >> Then maybe I'm misunderstanding the static_call() thing you were
> >> describing. Is it like, at KVM module initialization time,
> >> 
> >>     if module_param == disable_tracking:
> >>         .__kvm_get_memory_attributes = read_attributes_from_guest_memfd
> >>     else
> >>         .__kvm_get_memory_attributes = read_attributes_from_mem_attr_array
> >> 
> >> With that, I can't have both CoCo private/shared state tracked in
> >> guest_memfd and RWX (as an example, could be any future attribute)
> >> tracked in mem_attr_array on the same VM.
> >
> > More or less.
> >
> 
> Hm okay. So introducing the module param will only allow the use of one
> of the following?
> 
> + KVM_SET_MEMORY_ATTRIBUTES (vm ioctl)
> + KVM_SET_MEMORY_ATTRIBUTES2 (guest_memfd ioctl)
> 
> Then I guess using a module param which is a weaker userspace contract
> allows us to later enable both vm and guest_memfd ioctl if the need
> arises?

In theory.  More importantly from my perspective, making the knob global instead
of per-VM simplifies the implementation.

> >> + KVM_SET_MEMORY_ATTRIBUTES
> >>     + Is VM ioctl
> >>     + Is a write-only ioctl
> >>     + Is for setting memory attributes at a VM level
> >>     + Use struct kvm_memory_attributes for this
> >> + KVM_GUEST_MEMFD_SET_MEMORY_ATTRIBUTES (name TBD)
> >>     + Is guest_memfd ioctl
> >>     + Is a read/write ioctl
> >>     + Is for setting memory attributes only for this guest_memfd
> >>     + Use struct guest_memfd_memory_attributes for this
> >>     + Also decode errors from this struct
> >
> >       + Has extra padding for future expansion (because why not)
> >
> > If we really truly need a new ioctl, I'd probably prefer KVM_SET_MEMORY_ATTRIBUTES2.
> > Yeah, it's silly, but I don't think baking GUEST_MEMFD into the names buys us
> > anything.  Then we can use KVM_SET_MEMORY_ATTRIBUTES2 on a VM if the need ever
> > arises.
> >
> 
> I'm for having a new ioctl number and new struct, which are you leaning
> towards?
> 
> As for the naming, I think it's confusing to have something similar, and
> Ira mentioned it being confusing in the other email too.

It might sound confusing, but I don't think the _code_ will be confusing.  If it's
a module param, it's guaranteed to be global for any VMM.  Then the conversion code
can be a simple redirect, e.g. I would expect/hope the QEMU change to be something
like:

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index f89568bfa3..5253cf7275 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -1421,6 +1421,10 @@ static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
     int r;
 
     assert((attr & kvm_supported_memory_attributes) == attr);
+
+    if (kvm_use_gmem_attributes)
+        return kvm_get_gmem_fd(start, size, attr);
+
     attrs.attributes = attr;
     attrs.address = start;
     attrs.size = size;

> At the same time, I accept that it's useful if the same struct were to be
> used for a new iteration of the KVM_SET_MEMORY_ATTRIBUTES VM ioctl in future.
> No strong preference either way on naming.
> 
> Trying to understand the difference between unwind on failure vs
> all-or-nothing:
> 
> > Alternative #1 is to try and unwind on failure, but that gets complex, and it
> > simply can't be done for some CoCo VMs.  E.g. a private=>shared conversion for
> > TDX is destructive.
> >
> 
> Unwind on failure is:
> 
> 1. Store current state
> 2. Convert
> 3. Restore current state on conversion failure

Not quite, the above is missing key steps that cause problems: invalidation and
memory allocation.

 1. Zap stage-1 mappings
 2. Check if range can be converted
 3. Zap stage-2 mappings
 4. Store state for target range

For TDX, #3 is the point of no return.  That means #4 must not fail, because at
the very least individual page conversions need to be atomic.
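
Roughly, eliding locking, and with made-up helper names for #2 and #4:

	unmap_mapping_pages(mapping, start, end - start, false);	/* #1 */

	r = check_for_unexpected_refcounts(inode, start, end);		/* #2 */
	if (r)
		return r;		/* still fully recoverable */

	kvm_gmem_invalidate_begin(inode, start, end);			/* #3 */
	store_shareability(inode, start, end);	/* #4: must not fail */
	kvm_gmem_invalidate_end(inode, start, end);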

> > Alternative #2 is to make the updates atomic and all-or-nothing, which is what
> > we did for per-VM attributes.  That's doable, but it'd either be much more
> > complex than telling userspace to retry, or we'd have to lose the maple tree
> > optimizations (which is effectively what we did for per-VM attributes).
> >
> 
> All-or-nothing:
> 
> 1. Do everything to make sure conversion doesn't fail, bail early if it
>    fails
> 2. Convert (always successful)
> 
> Is that it?

No, because #1 is non-trivial.  The wrinkle is that, without doing an initial
pass through mtree, it's impossible to know how many new entries will be needed
(which you already know).  It's a solvable problem,  e.g. in this RFC, it's kinda
sorta handled by allocating temporary structures to track the mtree metadata, but
that's the type of complexity I want to avoid.

> Zapping private pages from the stage 2 page tables for TDX can't be
> recovered without help from the guest (I think that's what you're
> talking about too), 

Ya.

> although technically I think this zapping step could be delayed right till the end.

Hmm.  If we fully committed to blocking all "gets" via the filemap lock, then
yes, I think it could be delayed until the end?  Tempting.   But as you note
below, I'd prefer to give ourselves an out from an ABI perspective.

> Maple tree allocations for conversion could fail, 

Ya, in my local version I handle that by pre-allocating.

	mas_for_each(&mas, entry, end - 1) {
		MA_STATE(m2, &gi->attributes, 0, 0);

		if (attrs->attributes == xa_to_value(entry))
			continue;

		r = kvm_gmem_mas_preallocate(&m2, attrs->attributes, start, end);
		if (r) {
			*err_index = m2.index;
			goto out;
		}

		unmap_mapping_pages(mapping, start, nr_pages, false);

		if (!kvm_gmem_is_safe_for_conversion(inode, start, nr_pages, err_index)) {
			mas_destroy(&m2);
			r = -EAGAIN;
			goto out;
		}

		kvm_gmem_invalidate_begin(inode, start, end);

		mas_store_prealloc(&m2, xa_mk_value(attrs->attributes));

		kvm_gmem_invalidate_end(inode, start, end);
	}

> and allocations are a bit more complicated since we try to compact ranges
> with the same shared/private status into one same maple tree node. Still
> technically possible, maybe by updating a copy of the maple tree first, then
> swapping the current maple tree out atomically.

Heh, more complexity I'd prefer to avoid.

> With HugeTLB, undoing HVO needs pages to be allocated, will need more
> digging into the details to determine if preallocation could work.

It has to work, otherwise we're hosed.  Though it should be noted that "preallocate"
here just means before kvm_gmem_invalidate_begin().  Everything up to that point
can fail.

The other option in all of this is to add an API to _block_ stage-2 mappings, in
TDX terminology.  In TDX, BLOCK marks leaf S-EPT entries !PRESENT, but preserves
all the metadata.  So if we're too scared to do an after-the-fact invalidation,
and we need to support failure after kvm_gmem_invalidate_begin(), we could rework
the TDX backend to support e.g. kvm_gmem_invalidate_abort().  On begin(), KVM 
would BLOCK mappings; on abort(), restore; on end() fully remove.

> I'd still prefer having the option to return an error so that we don't
> paint ourselves into a corner.

Yep, my thoughts exactly.
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Yan Zhao 4 months, 2 weeks ago
On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> +						 pgoff_t index)
> +{
> +	struct maple_tree *mt;
> +	void *entry;
> +
> +	mt = &kvm_gmem_private(inode)->shareability;
> +	entry = mtree_load(mt, index);
> +	WARN(!entry,
> +	     "Shareability should always be defined for all indices in inode.");
> +
> +	return xa_to_value(entry);
> +}
> +
Hi Ackerley,

Not sure if it's a known issue. Just want to let you know in case you're unaware.

During a test that repeatedly launches/destroys TDs, I encountered a warning
from kvm_gmem_shareability_get() (see the attached log at the bottom).
It reproduces once in every 20-100 TD launches.


After some analysis, I found that the warning was produced by
kvm_gmem_shareability_get() when it's called from kvm_gmem_is_private(), which
is not protected by any locks.

I can get rid of the warning by either fix 1 or fix 2 below.
(I prefer fix 1 though :))

fix 1:

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e78fbebf4f53..136d46c5b2ab 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -2024,7 +2024,7 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,

 #ifdef CONFIG_KVM_GMEM_SHARED_MEM
        if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED) {
-               mt_init(&private->shareability);
+               mt_init_flags(&private->shareability, MT_FLAGS_USE_RCU);

                err = kvm_gmem_shareability_setup(private, size, flags);
                if (err)


fix 2:
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index e78fbebf4f53..9a4518104d56 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -171,7 +171,9 @@ static enum shareability kvm_gmem_shareability_get(struct inode *inode,
        void *entry;

        mt = &kvm_gmem_private(inode)->shareability;
+       mtree_lock(mt);
        entry = mtree_load(mt, index);
+       mtree_unlock(mt);
        WARN(!entry,
             "Shareability should always be defined for all indices in inode.");


Thanks
Yan

[  845.253021] ------------[ cut here ]------------
[  845.259236] Shareability should always be defined for all indices in inode.
[  845.259273] WARNING: CPU: 148 PID: 3775 at arch/x86/kvm/../../../virt/kvm/guest_memfd.c:175 kvm_gmem_shareability_get.isra.0+0x39/0x50 [kvm]
[  845.283330] Modules linked in: kvm_intel i2c_i801 idxd i2c_smbus i2c_ismt kvm irqbypass nls_iso8859_1 nls_cp437 squashfs ghash_clmulni_intel hid_generic aesni_intel
[  845.300914] CPU: 148 UID: 0 PID: 3775 Comm: qemu-system-x86 Tainted: G S                  6.16.0-rc6-upstream+ #520 PREEMPT(voluntary)  49e4d0c13b52dd8fe7006bbbb80b018c4576ab2d
[  845.319631] Tainted: [S]=CPU_OUT_OF_SPEC
[  845.324956] Hardware name: Intel Corporation ArcherCity/ArcherCity, BIOS EGSDCRB1.SYS.0101.D29.2303301937 03/30/2023
[  845.337749] RIP: 0010:kvm_gmem_shareability_get.isra.0+0x39/0x50 [kvm]
[  845.346085] Code: bf 48 02 00 00 e8 a7 d4 08 d1 48 85 c0 74 09 c9 48 d1 e8 c3 cc cc cc cc 48 89 45 f8 90 48 c7 c7 a0 56 5c c0 e8 68 3c b5 cf 90 <0f> 0b 90 90 48 8b 45 f8 c9 48 d1 e8 c3 cc cc cc cc 66 0f 1f 44 00
[  845.368227] RSP: 0018:ff29e9c2e336baa0 EFLAGS: 00010282
[  845.375038] RAX: 0000000000000000 RBX: 00000000001825d4 RCX: 0000000000000000
[  845.384020] RDX: 0000000000000002 RSI: 0000000000000001 RDI: 00000000ffffffff
[  845.392966] RBP: ff29e9c2e336baa8 R08: 0000000000000000 R09: 0000000000000000
[  845.401912] R10: 0000000000000001 R11: 0000000000000000 R12: ff1236f76e067a80
[  845.410878] R13: ff1236f76e0ecc00 R14: 0000000000000000 R15: ff1236f783af8000
[  845.419850] FS:  00007f8b863fc6c0(0000) GS:ff12370458883000(0000) knlGS:0000000000000000
[  845.429915] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  845.437304] CR2: 0000000000000000 CR3: 00000003e9989005 CR4: 0000000000773ef0
[  845.446265] PKRU: 55555554
[  845.450224] Call Trace:
[  845.453887]  <TASK>
[  845.457161]  kvm_gmem_is_private+0x4b/0x70 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.467348]  kvm_mmu_faultin_pfn+0x14a/0x360 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.477740]  kvm_tdp_page_fault+0x97/0xf0 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.487843]  kvm_mmu_do_page_fault+0x23d/0x290 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.505524]  ? __this_cpu_preempt_check+0x13/0x20
[  845.515349]  kvm_mmu_page_fault+0x8c/0x3d0 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.529136]  tdx_handle_ept_violation+0x16a/0x310 [kvm_intel 1efe846cc4054cc289d319f1912cf040ec0ca0e6]
[  845.547760]  tdx_handle_exit+0x44f/0x540 [kvm_intel 1efe846cc4054cc289d319f1912cf040ec0ca0e6]
[  845.565647]  ? lock_acquire+0x52/0x70
[  845.574284]  ? vcpu_enter_guest+0x452/0x11d0 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.591886]  vt_handle_exit+0x25/0x30 [kvm_intel 1efe846cc4054cc289d319f1912cf040ec0ca0e6]
[  845.609407]  vcpu_enter_guest+0x4b1/0x11d0 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.623253]  ? kvm_apic_local_deliver+0x8a/0xe0 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.641247]  vcpu_run+0x4d/0x280 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.654096]  ? vcpu_run+0x4d/0x280 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.667165]  kvm_arch_vcpu_ioctl_run+0x544/0x890 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.685231]  kvm_vcpu_ioctl+0x143/0x7c0 [kvm 6f655eadf3c2ae71b90b04a3d4ef5b799600c3f8]
[  845.698810]  ? __fget_files+0xc2/0x1b0
[  845.707633]  ? __this_cpu_preempt_check+0x13/0x20
[  845.717555]  ? __fget_files+0xcc/0x1b0
[  845.726405]  __x64_sys_ioctl+0x9a/0xf0
[  845.735241]  ? __this_cpu_preempt_check+0x13/0x20
[  845.745163]  x64_sys_call+0x1054/0x20c0
[  845.754043]  do_syscall_64+0xc3/0x470
[  845.762701]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
[  845.772906] RIP: 0033:0x7f8d9c124ded
[  845.781398] Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00
[  845.814651] RSP: 002b:00007f8b863f7cd0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  845.827882] RAX: ffffffffffffffda RBX: 00007f8b863fccdc RCX: 00007f8d9c124ded
[  845.840591] RDX: 0000000000000000 RSI: 000000000000ae80 RDI: 000000000000001e
[  845.853201] RBP: 00007f8b863f7d20 R08: 0000000000000000 R09: 0000000000000000
[  845.865776] R10: 0000000000000000 R11: 0000000000000246 R12: 00007f8b863fc6c0
[  845.878246] R13: ffffffffffffdbf0 R14: 0000000000000007 R15: 00007ffedb593c00
[  845.890732]  </TASK>
[  845.897565] irq event stamp: 859157
[  845.905815] hardirqs last  enabled at (859171): [<ffffffff902447d3>] __up_console_sem+0x63/0x90
[  845.923321] hardirqs last disabled at (859184): [<ffffffff902447b8>] __up_console_sem+0x48/0x90
[  845.940892] softirqs last  enabled at (859126): [<ffffffff90194ef8>] handle_softirqs+0x358/0x4b0
[  845.958654] softirqs last disabled at (859207): [<ffffffff901951cf>] __irq_exit_rcu+0xef/0x170
[  845.976232] ---[ end trace 0000000000000000 ]---
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 4 months ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
>> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
>> +						 pgoff_t index)
>> +{
>> +	struct maple_tree *mt;
>> +	void *entry;
>> +
>> +	mt = &kvm_gmem_private(inode)->shareability;
>> +	entry = mtree_load(mt, index);
>> +	WARN(!entry,
>> +	     "Shareability should always be defined for all indices in inode.");
>> +
>> +	return xa_to_value(entry);
>> +}
>> +
> Hi Ackerley,
>
> Not sure if it's a known issue. Just want to let you know in case you're unaware.
>

Thanks for informing me, and thanks for the analysis :)

> During a test that repeatedly launches/destroys TDs, I encountered a warning
> from kvm_gmem_shareability_get() (see the attached log at the bottom).
> The issue reproduces about once in every 20-100 TD launches.
>
> After some analysis, I found that the warning was produced by
> kvm_gmem_shareability_get() when it's called from kvm_gmem_is_private(), which
> is not protected by any locks.
>
> I can get rid of the warning by either fix 1 or fix 2 below.
> (I prefer fix 1 though :))
>
> fix 1:
>
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index e78fbebf4f53..136d46c5b2ab 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -2024,7 +2024,7 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>
>  #ifdef CONFIG_KVM_GMEM_SHARED_MEM
>         if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED) {
> -               mt_init(&private->shareability);
> +               mt_init_flags(&private->shareability, MT_FLAGS_USE_RCU);
>
>                 err = kvm_gmem_shareability_setup(private, size, flags);
>                 if (err)
>

I'm not sure which version of the conversion patch series you're using;
in the version I'm preparing, I'm using filemap_invalidate_lock_shared()
to guard shareability reads. filemap_invalidate_lock() is held during
shareability updates, so I think this issue should be fixed.

Please let me know if you're still seeing this issue in the next series
(coming soon). Thank you!
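
For reference, the read side I have in mind looks roughly like the sketch
below. This is illustrative only (in particular, the lockdep assertion is
just there to show the locking contract), not the actual code from the
next revision:

static enum shareability kvm_gmem_shareability_get(struct inode *inode,
						   pgoff_t index)
{
	void *entry;

	/*
	 * Callers must hold filemap_invalidate_lock_shared(), which
	 * excludes the mtree_store_range() calls that are done under
	 * filemap_invalidate_lock(), so a plain mtree_load() is safe.
	 */
	lockdep_assert_held(&inode->i_mapping->invalidate_lock);

	entry = mtree_load(&kvm_gmem_private(inode)->shareability, index);
	WARN(!entry,
	     "Shareability should always be defined for all indices in inode.");

	return xa_to_value(entry);
}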

>
> fix 2:
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index e78fbebf4f53..9a4518104d56 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -171,7 +171,9 @@ static enum shareability kvm_gmem_shareability_get(struct inode *inode,
>         void *entry;
>
>         mt = &kvm_gmem_private(inode)->shareability;
> +       mtree_lock(mt);
>         entry = mtree_load(mt, index);
> +       mtree_unlock(mt);
>         WARN(!entry,
>              "Shareability should always be defined for all indices in inode.");
>
>
> Thanks
> Yan
>
> 
> [...snip...]
>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Michael Roth 6 months, 3 weeks ago
On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> Track guest_memfd memory's shareability status within the inode as
> opposed to the file, since it is property of the guest_memfd's memory
> contents.
> 
> Shareability is a property of the memory and is indexed using the
> page's index in the inode. Because shareability is the memory's
> property, it is stored within guest_memfd instead of within KVM, like
> in kvm->mem_attr_array.
> 
> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> retained to allow VMs to only use guest_memfd for private memory and
> some other memory for shared memory.
> 
> Not all use cases require guest_memfd() to be shared with the host
> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> private to the guest, and therefore not mappable by the
> host. Otherwise, memory is shared until explicitly converted to
> private.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> ---
>  Documentation/virt/kvm/api.rst |   5 ++
>  include/uapi/linux/kvm.h       |   2 +
>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>  3 files changed, 129 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 86f74ce7f12a..f609337ae1c2 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>  This is validated when the guest_memfd instance is bound to the VM.
>  
> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> +will initialize the memory for the guest_memfd as guest-only and not faultable
> +by the host.
> +

KVM_CAP_GMEM_CONVERSION doesn't get introduced until later, so it seems
like this flag should be deferred until that patch is in place. Is it
really needed at that point though? Userspace would be able to set the
initial state via KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls.

The mtree contents seem to get stored in the same manner in either case, so
performance-wise only the overhead of a few userspace<->kernel switches
would be saved. Are there any other reasons?

Otherwise, maybe just settle on SHARED as a documented default (since at
least non-CoCo VMs would be able to reliably benefit) and let
CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
granularity makes sense for the architecture/guest configuration.

>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
>  
>  4.143 KVM_PRE_FAULT_MEMORY
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4cc824a3a7c9..d7df312479aa 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>  
>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> +
>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
>  
>  struct kvm_create_guest_memfd {
>  	__u64 size;
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 239d0f13dcc1..590932499eba 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
>  #include <linux/falloc.h>
>  #include <linux/fs.h>
>  #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/pagemap.h>
>  
> @@ -17,6 +18,24 @@ struct kvm_gmem {
>  	struct list_head entry;
>  };
>  
> +struct kvm_gmem_inode_private {
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	struct maple_tree shareability;
> +#endif
> +};
> +
> +enum shareability {
> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
> +};
> +
> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> +
> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> +{
> +	return inode->i_mapping->i_private_data;
> +}
> +
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +
> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
> +				      loff_t size, u64 flags)
> +{
> +	enum shareability m;
> +	pgoff_t last;
> +
> +	last = (size >> PAGE_SHIFT) - 1;
> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
> +						    SHAREABILITY_ALL;
> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
> +				 GFP_KERNEL);

One really nice thing about using a maple tree is that it should get rid
of a fairly significant startup delay for SNP/TDX when the entire xarray gets
initialized with private attribute entries via KVM_SET_MEMORY_ATTRIBUTES
(which is the current QEMU default behavior).

I'd originally advocated for sticking with the xarray implementation Fuad was
using until we'd determined we really need it for HugeTLB support, but I'm
sort of thinking it's already justified just based on the above.

Maybe it would make sense for KVM memory attributes too?
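
For illustration, the difference amounts to something like the sketch
below (not code from either series; declarations and error handling
trimmed). The xarray loop mirrors the one-entry-per-page initialization
that KVM_SET_MEMORY_ATTRIBUTES effectively does today, while the maple
tree covers the whole range with a single entry:

/* xarray: one entry per index, O(npages) stores at startup. */
for (index = 0; index < npages; index++) {
	ret = xa_err(xa_store(&xa, index, xa_mk_value(attrs), GFP_KERNEL));
	if (ret)
		break;
}

/* maple tree: one range entry covering all indices. */
ret = mtree_store_range(&mt, 0, npages - 1, xa_mk_value(attrs),
			GFP_KERNEL);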

> +}
> +
> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> +						 pgoff_t index)
> +{
> +	struct maple_tree *mt;
> +	void *entry;
> +
> +	mt = &kvm_gmem_private(inode)->shareability;
> +	entry = mtree_load(mt, index);
> +	WARN(!entry,
> +	     "Shareability should always be defined for all indices in inode.");
> +
> +	return xa_to_value(entry);
> +}
> +
> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> +		return ERR_PTR(-EACCES);
> +
> +	return kvm_gmem_get_folio(inode, index);
> +}
> +
> +#else
> +
> +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> +{
> +	return 0;
> +}
> +
> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	WARN_ONCE("Unexpected call to get shared folio.")
> +	return NULL;
> +}
> +
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> +
>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  				    pgoff_t index, struct folio *folio)
>  {
> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>  
>  	filemap_invalidate_lock_shared(inode->i_mapping);
>  
> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>  	if (IS_ERR(folio)) {
>  		int err = PTR_ERR(folio);
>  
> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
>  	.fallocate	= kvm_gmem_fallocate,
>  };
>  
> +static void kvm_gmem_free_inode(struct inode *inode)
> +{
> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> +
> +	kfree(private);
> +
> +	free_inode_nonrcu(inode);
> +}
> +
> +static void kvm_gmem_destroy_inode(struct inode *inode)
> +{
> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> +
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	/*
> +	 * mtree_destroy() can't be used within rcu callback, hence can't be
> +	 * done in ->free_inode().
> +	 */
> +	if (private)
> +		mtree_destroy(&private->shareability);
> +#endif
> +}
> +
>  static const struct super_operations kvm_gmem_super_operations = {
>  	.statfs		= simple_statfs,
> +	.destroy_inode	= kvm_gmem_destroy_inode,
> +	.free_inode	= kvm_gmem_free_inode,
>  };
>  
>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>  						      loff_t size, u64 flags)
>  {
> +	struct kvm_gmem_inode_private *private;
>  	struct inode *inode;
> +	int err;
>  
>  	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>  	if (IS_ERR(inode))
>  		return inode;
>  
> +	err = -ENOMEM;
> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> +	if (!private)
> +		goto out;
> +
> +	mt_init(&private->shareability);
> +	inode->i_mapping->i_private_data = private;
> +
> +	err = kvm_gmem_shareability_setup(private, size, flags);
> +	if (err)
> +		goto out;
> +
>  	inode->i_private = (void *)(unsigned long)flags;
>  	inode->i_op = &kvm_gmem_iops;
>  	inode->i_mapping->a_ops = &kvm_gmem_aops;
> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>  	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>  
>  	return inode;
> +
> +out:
> +	iput(inode);
> +
> +	return ERR_PTR(err);
>  }
>  
>  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
> @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>  	if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
>  		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
>  
> +	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
> +		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
> +
>  	if (flags & ~valid_flags)
>  		return -EINVAL;
>  
> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	if (!file)
>  		return -EFAULT;
>  
> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +

I like the idea of using a write-lock/read-lock to protect write/read access
to shareability state (though maybe not necessarily re-using filemap's
invalidate lock); it's simple and still allows concurrent faulting in of gmem
pages. One issue on the SNP side (which also came up in one of the gmem calls)
is that if we introduce support for tracking preparedness as discussed (e.g.
via a new SHAREABILITY_GUEST_PREPARED state), the
SHAREABILITY_GUEST->SHAREABILITY_GUEST_PREPARED transition would occur at
fault-time, and so would need to take the write-lock and no longer allow for
concurrent fault-handling.

I was originally planning on introducing a new rw_semaphore with similar
semantics to the rw_lock that Fuad previously had in his restricted mmap
series[1] (and similar semantics to the filemap invalidate lock here). The main
difference, to handle setting SHAREABILITY_GUEST_PREPARED within fault paths,
was that in the case of a folio being present for an index, the folio lock would
also need to be held in order to update the shareability state. Because
of that, fault paths (which will always either have or allocate a folio)
can rely on the folio lock to guard shareability state in a more
granular way and so can avoid a global write lock.

They would still need to hold the read lock to access the tree, however.
Or more specifically, any path that could allocate a folio needs to take
the read lock so there isn't a TOCTOU situation where shareability is
being updated for an index for which a folio hasn't been allocated, but
then just afterward the folio gets faulted in/allocated while the
shareability state is still being updated, with the understanding that
there was no folio around that needed locking.
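
As a rough sketch of the fault path I'm describing (names are
hypothetical placeholders rather than code from an actual branch, and it
assumes kvm_gmem_get_folio() returns the folio locked, as it does today):

static int gmem_fault_prepare(struct inode *inode, pgoff_t index)
{
	struct folio *folio;
	int ret;

	/* Read lock: keeps shareability updates out while we allocate. */
	filemap_invalidate_lock_shared(inode->i_mapping);

	folio = kvm_gmem_get_folio(inode, index);	/* returned locked */
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out;
	}

	/*
	 * The folio lock (already held) guards the SHAREABILITY_GUEST ->
	 * SHAREABILITY_GUEST_PREPARED transition for this index only, so
	 * no global write lock is needed in the fault path.
	 */
	ret = gmem_set_prepared(inode, index);	/* hypothetical helper */

	folio_unlock(folio);
	folio_put(folio);
out:
	filemap_invalidate_unlock_shared(inode->i_mapping);
	return ret;
}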

I had a branch with in-place conversion support for SNP[2] that added this
lock reworking on top of Fuad's series along with preparation tracking,
but I'm now planning to rebase that on top of the patches from this
series that Sean mentioned[3] earlier:

  KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
  KVM: Query guest_memfd for private/shared status
  KVM: guest_memfd: Skip LRU for guest_memfd folios
  KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
  KVM: guest_memfd: Introduce and use shareability to guard faulting
  KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes

but figured I'd mention it here in case there are other things to consider on
the locking front.

Definitely agree with Sean though that it would be nice to start identifying a
common base of patches for the in-place conversion enablement for SNP, TDX, and
pKVM so the APIs/interfaces for hugepages can be handled separately.

-Mike

[1] https://lore.kernel.org/kvm/20250328153133.3504118-1-tabba@google.com/
[2] https://github.com/mdroth/linux/commits/mmap-swprot-v10-snp0-wip2/
[3] https://lore.kernel.org/kvm/aC86OsU2HSFZkJP6@google.com/

>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
>  	if (IS_ERR(folio)) {
>  		r = PTR_ERR(folio);
> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		*page = folio_file_page(folio, index);
>  	else
>  		folio_put(folio);
> -
>  out:
> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>  	fput(file);
>  	return r;
>  }
> -- 
> 2.49.0.1045.g170613ef41-goog
>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 6 months, 1 week ago
Michael Roth <michael.roth@amd.com> writes:

> On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:

I missed responding to the second two comments earlier!

[...]

>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +
>> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
>> +				      loff_t size, u64 flags)
>> +{
>> +	enum shareability m;
>> +	pgoff_t last;
>> +
>> +	last = (size >> PAGE_SHIFT) - 1;
>> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
>> +						    SHAREABILITY_ALL;
>> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
>> +				 GFP_KERNEL);
>
> One really nice thing about using a maple tree is that it should get rid
> of a fairly significant startup delay for SNP/TDX when the entire xarray gets
> initialized with private attribute entries via KVM_SET_MEMORY_ATTRIBUTES
> (which is the current QEMU default behavior).
>
> I'd originally advocated for sticking with the xarray implementation Fuad was
> using until we'd determined we really need it for HugeTLB support, but I'm
> sort of thinking it's already justified just based on the above.
>

We discussed this at the guest_memfd upstream call, and I believe the
current position is to go with maple_trees. Thanks for bringing this up!

> Maybe it would make sense for KVM memory attributes too?
>

I think so, but I haven't had the chance to work on that.

>> +}
>> +

[...]

>> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  	if (!file)
>>  		return -EFAULT;
>>  
>> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
>> +

In this RFC, the filemap_invalidate_lock() was basically used to
serialize everything that could modify shareability.

>
> I like the idea of using a write-lock/read-lock to protect write/read access
> to shareability state (though maybe not necessarily re-using filemap's
> invalidate lock), it's simple and still allows concurrent faulting in of gmem
> pages. One issue on the SNP side (which also came up in one of the gmem calls)
> is if we introduce support for tracking preparedness as discussed (e.g. via a
> new SHAREABILITY_GUEST_PREPARED state) the
> SHAREABILITY_GUEST->SHAREABILITY_GUEST_PREPARED transition would occur at
> fault-time, and so would need to take the write-lock and no longer allow for
> concurrent fault-handling.
>
> I was originally planning on introducing a new rw_semaphore with similar
> semantics to the rw_lock that Fuad previously had in his restricted mmap
> series[1] (and similar semantics to filemap invalidate lock here). The main
> difference, to handle setting SHAREABILITY_GUEST_PREPARED within fault paths,
> was that in the case of a folio being present for an index, the folio lock would
> also need to be held in order to update the shareability state. Because
> of that, fault paths (which will always either have or allocate folio
> basically) can rely on the folio lock to guard shareability state in a more
> granular way and so can avoid a global write lock.
>
> They would still need to hold the read lock to access the tree however.
> Or more specifically, any paths that could allocate a folio need to take
> a read lock so there isn't a TOCTOU situation where shareability is
> being updated for an index for which a folio hasn't been allocated, but
> then just afterward the folio gets faulted in/allocated while the
> shareability state is already being updated, with the understanding that
> there was no folio around that needed locking.
>
> I had a branch with in-place conversion support for SNP[2] that added this
> lock reworking on top of Fuad's series along with preparation tracking,
> but I'm now planning to rebase that on top of the patches from this
> series that Sean mentioned[3] earlier:
>
>   KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
>   KVM: Query guest_memfd for private/shared status
>   KVM: guest_memfd: Skip LRU for guest_memfd folios
>   KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
>   KVM: guest_memfd: Introduce and use shareability to guard faulting
>   KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes
>
> but figured I'd mention it here in case there are other things to consider on
> the locking front.

We discussed this a little at the last guest_memfd call: I'll summarize
the question I raised during the call here in text. :)

Today in guest_memfd the "prepared" and "zeroed" concepts are tracked
with the folio's uptodate flag.

Preparation is only used by SNP today, and TDX does the somewhat
equivalent "preparation" at the time of mapping into the guest page table.

Can we do SNP's preparation at some other point in time and not let the
"prepared" state be handled by guest_memfd at all?

This might simplify locking too: preparedness would then be locked
whenever SNP needs it, independently of shareability tracking.

Also, this might simplify the routines that use kvm_gmem_populate(), and
perhaps remove the need for kvm_gmem_populate() entirely. The current
callers are basically using kvm_gmem_populate() to allocate pages; why not
call kvm_gmem_get_folio() to do the allocation?
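
That is, something shaped like the fragment below (illustrative only:
prepare_one_page() is a hypothetical stand-in for the arch-specific
work, and it assumes kvm_gmem_get_folio(), which is internal to
guest_memfd today, were callable from such a path):

for (i = 0; i < npages; i++) {
	folio = kvm_gmem_get_folio(inode, index + i);	/* locked */
	if (IS_ERR(folio))
		return PTR_ERR(folio);

	/* e.g. copy in the payload, do the SNP/TDX-specific setup */
	ret = prepare_one_page(folio, index + i);

	folio_unlock(folio);
	folio_put(folio);
	if (ret)
		return ret;
}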

Another tangential point: it's hard to use the uptodate flag for
tracking preparedness, since with huge pages the uptodate flag can only
indicate whether the entire folio is prepared, but a user of the memory
might only have part of the folio prepared.
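
For example, a 1G folio would need per-subpage state instead; as a
strawman, purely illustrative and not proposed code:

struct gmem_prepared_state {
	unsigned long *prepared;	/* one bit per 4K subpage */
};

static bool gmem_subrange_prepared(struct gmem_prepared_state *s,
				   pgoff_t start, pgoff_t nr)
{
	/* Prepared iff no bit in [start, start + nr) is clear. */
	return find_next_zero_bit(s->prepared, start + nr, start) >=
	       start + nr;
}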

>
> Definitely agree with Sean though that it would be nice to start identifying a
> common base of patches for the in-place conversion enablement for SNP, TDX, and
> pKVM so the APIs/interfaces for hugepages can be handled separately.
>
> -Mike
>
> [1] https://lore.kernel.org/kvm/20250328153133.3504118-1-tabba@google.com/
> [2] https://github.com/mdroth/linux/commits/mmap-swprot-v10-snp0-wip2/
> [3] https://lore.kernel.org/kvm/aC86OsU2HSFZkJP6@google.com/
>
>>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
>>  	if (IS_ERR(folio)) {
>>  		r = PTR_ERR(folio);
>> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  		*page = folio_file_page(folio, index);
>>  	else
>>  		folio_put(folio);
>> -
>>  out:
>> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>>  	fput(file);
>>  	return r;
>>  }
>> -- 
>> 2.49.0.1045.g170613ef41-goog
>>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 6 months, 1 week ago
Michael Roth <michael.roth@amd.com> writes:

> On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
>> Track guest_memfd memory's shareability status within the inode as
>> opposed to the file, since it is property of the guest_memfd's memory
>> contents.
>> 
>> Shareability is a property of the memory and is indexed using the
>> page's index in the inode. Because shareability is the memory's
>> property, it is stored within guest_memfd instead of within KVM, like
>> in kvm->mem_attr_array.
>> 
>> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
>> retained to allow VMs to only use guest_memfd for private memory and
>> some other memory for shared memory.
>> 
>> Not all use cases require guest_memfd() to be shared with the host
>> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
>> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
>> private to the guest, and therefore not mappable by the
>> host. Otherwise, memory is shared until explicitly converted to
>> private.
>> 
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
>> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
>> Co-developed-by: Fuad Tabba <tabba@google.com>
>> Signed-off-by: Fuad Tabba <tabba@google.com>
>> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
>> ---
>>  Documentation/virt/kvm/api.rst |   5 ++
>>  include/uapi/linux/kvm.h       |   2 +
>>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>>  3 files changed, 129 insertions(+), 2 deletions(-)
>> 
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 86f74ce7f12a..f609337ae1c2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>>  This is validated when the guest_memfd instance is bound to the VM.
>>  
>> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
>> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
>> +will initialize the memory for the guest_memfd as guest-only and not faultable
>> +by the host.
>> +
>
> KVM_CAP_GMEM_CONVERSION doesn't get introduced until later, so it seems
> like this flag should be deferred until that patch is in place. Is it
> really needed at that point though? Userspace would be able to set the
> initial state via KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls.
>

I can move this change to the later patch. Thanks! Will fix in the next
revision.

> The mtree contents seems to get stored in the same manner in either case so
> performance-wise only the overhead of a few userspace<->kernel switches
> would be saved. Are there any other reasons?
>
> Otherwise, maybe just settle on SHARED as a documented default (since at
> least non-CoCo VMs would be able to reliably benefit) and let
> CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
> granularity makes sense for the architecture/guest configuration.
>

Because shared pages are split once any memory is allocated, having a
way to INIT_PRIVATE could avoid the split and the subsequent merge on
conversion. I feel that is enough value to justify this config flag; what
do you think?

I guess we could also have userspace be careful not to do any allocation
before converting.
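
From userspace, the difference is just the creation flag; a minimal
sketch, using the flag values from this patch:

	/* Create a guest_memfd whose memory starts out private. */
	struct kvm_create_guest_memfd args = {
		.size  = 1UL << 30,
		.flags = GUEST_MEMFD_FLAG_SUPPORT_SHARED |
			 GUEST_MEMFD_FLAG_INIT_PRIVATE,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);

Since nothing is shared at creation, no folio needs to be split before
the guest converts ranges, whereas with a shared default any allocation
before conversion would trigger the split.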

>>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
>>  
>>  4.143 KVM_PRE_FAULT_MEMORY
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 4cc824a3a7c9..d7df312479aa 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
>>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>>  
>>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
>> +
>>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
>> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
>>  
>>  struct kvm_create_guest_memfd {
>>  	__u64 size;
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 239d0f13dcc1..590932499eba 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -4,6 +4,7 @@
>>  #include <linux/falloc.h>
>>  #include <linux/fs.h>
>>  #include <linux/kvm_host.h>
>> +#include <linux/maple_tree.h>
>>  #include <linux/pseudo_fs.h>
>>  #include <linux/pagemap.h>
>>  
>> @@ -17,6 +18,24 @@ struct kvm_gmem {
>>  	struct list_head entry;
>>  };
>>  
>> +struct kvm_gmem_inode_private {
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +	struct maple_tree shareability;
>> +#endif
>> +};
>> +
>> +enum shareability {
>> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
>> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
>> +};
>> +
>> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
>> +
>> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>> +{
>> +	return inode->i_mapping->i_private_data;
>> +}
>> +
>>  /**
>>   * folio_file_pfn - like folio_file_page, but return a pfn.
>>   * @folio: The folio which contains this index.
>> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>>  }
>>  
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +
>> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
>> +				      loff_t size, u64 flags)
>> +{
>> +	enum shareability m;
>> +	pgoff_t last;
>> +
>> +	last = (size >> PAGE_SHIFT) - 1;
>> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
>> +						    SHAREABILITY_ALL;
>> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
>> +				 GFP_KERNEL);
>
> One really nice thing about using a maple tree is that it should get rid
> of a fairly significant startup delay for SNP/TDX when the entire xarray gets
> initialized with private attribute entries via KVM_SET_MEMORY_ATTRIBUTES
> (which is the current QEMU default behavior).
>
> I'd originally advocated for sticking with the xarray implementation Fuad was
> using until we'd determined we really need it for HugeTLB support, but I'm
> sort of thinking it's already justified just based on the above.
>
> Maybe it would make sense for KVM memory attributes too?
>
>> +}
>> +
>> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
>> +						 pgoff_t index)
>> +{
>> +	struct maple_tree *mt;
>> +	void *entry;
>> +
>> +	mt = &kvm_gmem_private(inode)->shareability;
>> +	entry = mtree_load(mt, index);
>> +	WARN(!entry,
>> +	     "Shareability should always be defined for all indices in inode.");
>> +
>> +	return xa_to_value(entry);
>> +}
>> +
>> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> +{
>> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
>> +		return ERR_PTR(-EACCES);
>> +
>> +	return kvm_gmem_get_folio(inode, index);
>> +}
>> +
>> +#else
>> +
>> +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> +{
>> +	WARN_ONCE("Unexpected call to get shared folio.")
>> +	return NULL;
>> +}
>> +
>> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>> +
>>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  				    pgoff_t index, struct folio *folio)
>>  {
>> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>>  
>>  	filemap_invalidate_lock_shared(inode->i_mapping);
>>  
>> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>>  	if (IS_ERR(folio)) {
>>  		int err = PTR_ERR(folio);
>>  
>> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
>>  	.fallocate	= kvm_gmem_fallocate,
>>  };
>>  
>> +static void kvm_gmem_free_inode(struct inode *inode)
>> +{
>> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>> +
>> +	kfree(private);
>> +
>> +	free_inode_nonrcu(inode);
>> +}
>> +
>> +static void kvm_gmem_destroy_inode(struct inode *inode)
>> +{
>> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>> +
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +	/*
>> +	 * mtree_destroy() can't be used within rcu callback, hence can't be
>> +	 * done in ->free_inode().
>> +	 */
>> +	if (private)
>> +		mtree_destroy(&private->shareability);
>> +#endif
>> +}
>> +
>>  static const struct super_operations kvm_gmem_super_operations = {
>>  	.statfs		= simple_statfs,
>> +	.destroy_inode	= kvm_gmem_destroy_inode,
>> +	.free_inode	= kvm_gmem_free_inode,
>>  };
>>  
>>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
>> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>  						      loff_t size, u64 flags)
>>  {
>> +	struct kvm_gmem_inode_private *private;
>>  	struct inode *inode;
>> +	int err;
>>  
>>  	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>>  	if (IS_ERR(inode))
>>  		return inode;
>>  
>> +	err = -ENOMEM;
>> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
>> +	if (!private)
>> +		goto out;
>> +
>> +	mt_init(&private->shareability);
>> +	inode->i_mapping->i_private_data = private;
>> +
>> +	err = kvm_gmem_shareability_setup(private, size, flags);
>> +	if (err)
>> +		goto out;
>> +
>>  	inode->i_private = (void *)(unsigned long)flags;
>>  	inode->i_op = &kvm_gmem_iops;
>>  	inode->i_mapping->a_ops = &kvm_gmem_aops;
>> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>  	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>  
>>  	return inode;
>> +
>> +out:
>> +	iput(inode);
>> +
>> +	return ERR_PTR(err);
>>  }
>>  
>>  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
>> @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>>  	if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
>>  		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
>>  
>> +	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
>> +		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
>> +
>>  	if (flags & ~valid_flags)
>>  		return -EINVAL;
>>  
>> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  	if (!file)
>>  		return -EFAULT;
>>  
>> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
>> +
>
> I like the idea of using a write-lock/read-lock to protect write/read access
> to shareability state (though maybe not necessarily re-using filemap's
> invalidate lock), it's simple and still allows concurrent faulting in of gmem
> pages. One issue on the SNP side (which also came up in one of the gmem calls)
> is if we introduce support for tracking preparedness as discussed (e.g. via a
> new SHAREABILITY_GUEST_PREPARED state) the
> SHAREABILITY_GUEST->SHAREABILITY_GUEST_PREPARED transition would occur at
> fault-time, and so would need to take the write-lock and no longer allow for
> concurrent fault-handling.
>
> I was originally planning on introducing a new rw_semaphore with similar
> semantics to the rw_lock that Fuad previously had in his restricted mmap
> series[1] (and similar semantics to filemap invalidate lock here). The main
> difference, to handle setting SHAREABILITY_GUEST_PREPARED within fault paths,
> was that in the case of a folio being present for an index, the folio lock would
> also need to be held in order to update the shareability state. Because
> of that, fault paths (which will always either have or allocate folio
> basically) can rely on the folio lock to guard shareability state in a more
> granular way and so can avoid a global write lock.
>
> They would still need to hold the read lock to access the tree however.
> Or more specifically, any paths that could allocate a folio need to take
> a read lock so there isn't a TOCTOU situation where shareability is
> being updated for an index for which a folio hasn't been allocated, but
> then just afterward the folio gets faulted in/allocated while the
> shareability state is already being updated, with the understanding that
> there was no folio around that needed locking.
>
> I had a branch with in-place conversion support for SNP[2] that added this
> lock reworking on top of Fuad's series along with preparation tracking,
> but I'm now planning to rebase that on top of the patches from this
> series that Sean mentioned[3] earlier:
>
>   KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
>   KVM: Query guest_memfd for private/shared status
>   KVM: guest_memfd: Skip LRU for guest_memfd folios
>   KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
>   KVM: guest_memfd: Introduce and use shareability to guard faulting
>   KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes
>
> but figured I'd mention it here in case there are other things to consider on
> the locking front.
>
> Definitely agree with Sean though that it would be nice to start identifying a
> common base of patches for the in-place conversion enablement for SNP, TDX, and
> pKVM so the APIs/interfaces for hugepages can be handled separately.
>
> -Mike
>
> [1] https://lore.kernel.org/kvm/20250328153133.3504118-1-tabba@google.com/
> [2] https://github.com/mdroth/linux/commits/mmap-swprot-v10-snp0-wip2/
> [3] https://lore.kernel.org/kvm/aC86OsU2HSFZkJP6@google.com/
>
>>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
>>  	if (IS_ERR(folio)) {
>>  		r = PTR_ERR(folio);
>> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  		*page = folio_file_page(folio, index);
>>  	else
>>  		folio_put(folio);
>> -
>>  out:
>> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>>  	fput(file);
>>  	return r;
>>  }
>> -- 
>> 2.49.0.1045.g170613ef41-goog
>>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Michael Roth 5 months, 2 weeks ago
On Wed, Jun 11, 2025 at 02:51:38PM -0700, Ackerley Tng wrote:
> Michael Roth <michael.roth@amd.com> writes:
> 
> > On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> >> Track guest_memfd memory's shareability status within the inode as
> >> opposed to the file, since it is property of the guest_memfd's memory
> >> contents.
> >> 
> >> Shareability is a property of the memory and is indexed using the
> >> page's index in the inode. Because shareability is the memory's
> >> property, it is stored within guest_memfd instead of within KVM, like
> >> in kvm->mem_attr_array.
> >> 
> >> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> >> retained to allow VMs to only use guest_memfd for private memory and
> >> some other memory for shared memory.
> >> 
> >> Not all use cases require guest_memfd() to be shared with the host
> >> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> >> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> >> private to the guest, and therefore not mappable by the
> >> host. Otherwise, memory is shared until explicitly converted to
> >> private.
> >> 
> >> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> >> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> >> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> >> Co-developed-by: Fuad Tabba <tabba@google.com>
> >> Signed-off-by: Fuad Tabba <tabba@google.com>
> >> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> >> ---
> >>  Documentation/virt/kvm/api.rst |   5 ++
> >>  include/uapi/linux/kvm.h       |   2 +
> >>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
> >>  3 files changed, 129 insertions(+), 2 deletions(-)
> >> 
> >> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> >> index 86f74ce7f12a..f609337ae1c2 100644
> >> --- a/Documentation/virt/kvm/api.rst
> >> +++ b/Documentation/virt/kvm/api.rst
> >> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
> >>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
> >>  This is validated when the guest_memfd instance is bound to the VM.
> >>  
> >> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> >> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> >> +will initialize the memory for the guest_memfd as guest-only and not faultable
> >> +by the host.
> >> +
> >
> > KVM_CAP_GMEM_CONVERSION doesn't get introduced until later, so it seems
> > like this flag should be deferred until that patch is in place. Is it
> > really needed at that point though? Userspace would be able to set the
> > initial state via KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls.
> >
> 
> I can move this change to the later patch. Thanks! Will fix in the next
> revision.
> 
> > The mtree contents seems to get stored in the same manner in either case so
> > performance-wise only the overhead of a few userspace<->kernel switches
> > would be saved. Are there any other reasons?
> >
> > Otherwise, maybe just settle on SHARED as a documented default (since at
> > least non-CoCo VMs would be able to reliably benefit) and let
> > CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
> > granularity makes sense for the architecture/guest configuration.
> >
> 
> Because shared pages are split once any memory is allocated, having a
> way to INIT_PRIVATE could avoid the split and then merge on
> conversion. I feel that is enough value to have this config flag, what
> do you think?
> 
> I guess we could also have userspace be careful not to do any allocation
> before converting.

I assume we do want to support things like preallocating guest memory, so
I'm not sure this approach is feasible for avoiding splits.

But I feel like we might be working around a deeper issue here, which is
that we are pre-emptively splitting anything that *could* be mapped into
userspace (i.e. allocated+shared/mixed), rather than splitting when
necessary.

I know that was the plan laid out in the guest_memfd calls, but I've run
into a couple instances that have me thinking we should revisit this.

1) Some of the recent guest_memfd work seems to be gravitating towards having
   userspace populate/initialize the guest memory payload prior to boot via
   mmap()'ing the shared guest_memfd pages, so things work the same as
   they would for initializing a normal VM's memory payload (rather than
   relying on back-channels in the kernel to copy user data into guest_memfd
   pages).

   When you do this though, for an SNP guest at least, that memory
   acceptance is done in chunks of 4MB (with accept_memory=lazy), and
   because that will put each 1GB page into an allocated+mixed state,
   we end up splitting every 1GB to 4K, and the guest can't even
   accept/PVALIDATE it at 2MB at that point, even if userspace doesn't
   touch anything in the range. At some point the guest will convert/accept
   the entire range, at which point we could merge, but for SNP we'd
   need guest cooperation to actually use a higher granularity in stage2
   page tables at that point, since RMP entries are effectively all split
   to 4K.

   I understand the intent is to default to private where this wouldn't
   be an issue, and we could punt to userspace to deal with it, but it
   feels like an artificial restriction to place on userspace. And if we
   do want to allow/expect guest_memfd contents to be initialized pre-boot
   just like normal memory, then userspace would need to jump through
   some hoops:

   - if defaulting to private: add hooks to convert each range that's being
     modified to a shared state prior to writing to it
   - if defaulting to shared: initialize memory in-place, then convert
     everything else to private to avoid unnecessarily splitting folios
     at run-time

   It feels like implementation details are bleeding out into the API
   to some degree here (e.g. we'd probably at least need to document
   this so users know how to take proper advantage of hugepage support).

2) There are some use-cases for HugeTLB + CoCo that have come to my
   attention recently that put a lot of weight on still being able to
   maximize mapping/hugepage size when accessing shared mem from userspace,
   e.g. for certain DPDK workloads that access shared guest buffers
   from host userspace. We don't really have a story for this, and I
   wouldn't expect us to at this stage, but I think it ties into #1 so it
   might be worth considering in that context.

I'm still fine with the current approach as a starting point, but I'm
wondering if improving both #1/#2 might not be so bad, and it might even
give us some more flexibility. For instance, Sean had mentioned leaving
open the option of tracking more than just shareability/mappability, and
if there is split/merge logic associated with those transitions, then
re-scanning each of these attributes for a 1G range seems like it could
benefit from some sort of intermediate data structure to help determine
things like what mapping granularity is available to the guest/userspace
for a particular range.

One approach I was thinking of was to introduce a data structure
similar to KVM's memslot->arch.lpage_info, where we store information
about which 1G/2M ranges are shared/private/mixed. Then, instead of
splitting ahead of time, we just record that state into this data
structure (using the same write lock as with the
shareability/mappability state), and at *fault* time we split the
folio if our lpage_info-like data structure says the range is mixed.
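
Something shaped roughly like the sketch below (purely illustrative,
names made up; real code would presumably track counts per level the way
KVM's lpage_info does):

/* One entry per aligned 1G range of the inode. */
struct gmem_lpage_info {
	bool mixed;	/* shared and private coexist in this range */
};

static bool gmem_range_is_mixed(struct gmem_lpage_info *info, pgoff_t index)
{
	/* Only a mixed 1G range forces a split at fault time. */
	return info[index >> (30 - PAGE_SHIFT)].mixed;
}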

Then, if the guest converts a 2M/4M range to private while lazily accepting
(for instance), we can still keep the folio intact as 1GB, but mark
the 1G range in the lpage_info-like data structure as mixed so that we
still inform KVM/etc. that they need to map it as 2MB or lower in stage2
page tables. In that case, even at guest fault-time, we can leave the
folio unsplit until userspace tries to touch it (though in most cases
it never will, and we can keep most of the guest's 1G intact for the
duration of its lifetime).

On the userspace side, another nice thing is that if we see the 1G range is
in a mixed state but a 2M range within it is all-shared, then we can still
leave the folio as 2M, and I think the refcounting logic would still work
for the most part, which makes #2 a bit easier to implement as well.

And of course, we wouldn't need INIT_PRIVATE then, since we would only be
splitting when necessary.

But I guess this all comes down to how much extra pain there is in
tracking a 1G folio that's been split into a mix of 2MB/4K regions. I
think we'd get a lot more mileage out of getting that working and
just completely stripping out all of the merging logic for the initial
implementation (other than at cleanup time), so maybe complexity-wise
it balances out a bit?

Thanks,

Mike

> 
> >>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
> >>  
> >>  4.143 KVM_PRE_FAULT_MEMORY
> >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> >> index 4cc824a3a7c9..d7df312479aa 100644
> >> --- a/include/uapi/linux/kvm.h
> >> +++ b/include/uapi/linux/kvm.h
> >> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
> >>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
> >>  
> >>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> >> +
> >>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
> >> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
> >>  
> >>  struct kvm_create_guest_memfd {
> >>  	__u64 size;
> >> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> >> index 239d0f13dcc1..590932499eba 100644
> >> --- a/virt/kvm/guest_memfd.c
> >> +++ b/virt/kvm/guest_memfd.c
> >> @@ -4,6 +4,7 @@
> >>  #include <linux/falloc.h>
> >>  #include <linux/fs.h>
> >>  #include <linux/kvm_host.h>
> >> +#include <linux/maple_tree.h>
> >>  #include <linux/pseudo_fs.h>
> >>  #include <linux/pagemap.h>
> >>  
> >> @@ -17,6 +18,24 @@ struct kvm_gmem {
> >>  	struct list_head entry;
> >>  };
> >>  
> >> +struct kvm_gmem_inode_private {
> >> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> >> +	struct maple_tree shareability;
> >> +#endif
> >> +};
> >> +
> >> +enum shareability {
> >> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
> >> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
> >> +};
> >> +
> >> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> >> +
> >> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> >> +{
> >> +	return inode->i_mapping->i_private_data;
> >> +}
> >> +
> >>  /**
> >>   * folio_file_pfn - like folio_file_page, but return a pfn.
> >>   * @folio: The folio which contains this index.
> >> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
> >>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
> >>  }
> >>  
> >> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> >> +
> >> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
> >> +				      loff_t size, u64 flags)
> >> +{
> >> +	enum shareability m;
> >> +	pgoff_t last;
> >> +
> >> +	last = (size >> PAGE_SHIFT) - 1;
> >> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
> >> +						    SHAREABILITY_ALL;
> >> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
> >> +				 GFP_KERNEL);
> >
> > One really nice thing about using a maple tree is that it should get rid
> > of a fairly significant startup delay for SNP/TDX when the entire xarray gets
> > initialized with private attribute entries via KVM_SET_MEMORY_ATTRIBUTES
> > (which is the current QEMU default behavior).
> >
> > I'd originally advocated for sticking with the xarray implementation Fuad was
> > using until we'd determined we really need it for HugeTLB support, but I'm
> > sort of thinking it's already justified just based on the above.
> >
> > Maybe it would make sense for KVM memory attributes too?
> >
> >> +}
> >> +
> >> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> >> +						 pgoff_t index)
> >> +{
> >> +	struct maple_tree *mt;
> >> +	void *entry;
> >> +
> >> +	mt = &kvm_gmem_private(inode)->shareability;
> >> +	entry = mtree_load(mt, index);
> >> +	WARN(!entry,
> >> +	     "Shareability should always be defined for all indices in inode.");
> >> +
> >> +	return xa_to_value(entry);
> >> +}
> >> +
> >> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> >> +{
> >> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> >> +		return ERR_PTR(-EACCES);
> >> +
> >> +	return kvm_gmem_get_folio(inode, index);
> >> +}
> >> +
> >> +#else
> >> +
> >> +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> >> +{
> >> +	return 0;
> >> +}
> >> +
> >> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> >> +{
> >> +	WARN_ONCE("Unexpected call to get shared folio.")
> >> +	return NULL;
> >> +}
> >> +
> >> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> >> +
> >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >>  				    pgoff_t index, struct folio *folio)
> >>  {
> >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> >>  
> >>  	filemap_invalidate_lock_shared(inode->i_mapping);
> >>  
> >> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> >> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> >>  	if (IS_ERR(folio)) {
> >>  		int err = PTR_ERR(folio);
> >>  
> >> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
> >>  	.fallocate	= kvm_gmem_fallocate,
> >>  };
> >>  
> >> +static void kvm_gmem_free_inode(struct inode *inode)
> >> +{
> >> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> >> +
> >> +	kfree(private);
> >> +
> >> +	free_inode_nonrcu(inode);
> >> +}
> >> +
> >> +static void kvm_gmem_destroy_inode(struct inode *inode)
> >> +{
> >> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> >> +
> >> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> >> +	/*
> >> +	 * mtree_destroy() can't be used within rcu callback, hence can't be
> >> +	 * done in ->free_inode().
> >> +	 */
> >> +	if (private)
> >> +		mtree_destroy(&private->shareability);
> >> +#endif
> >> +}
> >> +
> >>  static const struct super_operations kvm_gmem_super_operations = {
> >>  	.statfs		= simple_statfs,
> >> +	.destroy_inode	= kvm_gmem_destroy_inode,
> >> +	.free_inode	= kvm_gmem_free_inode,
> >>  };
> >>  
> >>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
> >> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
> >>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
> >>  						      loff_t size, u64 flags)
> >>  {
> >> +	struct kvm_gmem_inode_private *private;
> >>  	struct inode *inode;
> >> +	int err;
> >>  
> >>  	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
> >>  	if (IS_ERR(inode))
> >>  		return inode;
> >>  
> >> +	err = -ENOMEM;
> >> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> >> +	if (!private)
> >> +		goto out;
> >> +
> >> +	mt_init(&private->shareability);
> >> +	inode->i_mapping->i_private_data = private;
> >> +
> >> +	err = kvm_gmem_shareability_setup(private, size, flags);
> >> +	if (err)
> >> +		goto out;
> >> +
> >>  	inode->i_private = (void *)(unsigned long)flags;
> >>  	inode->i_op = &kvm_gmem_iops;
> >>  	inode->i_mapping->a_ops = &kvm_gmem_aops;
> >> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
> >>  	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> >>  
> >>  	return inode;
> >> +
> >> +out:
> >> +	iput(inode);
> >> +
> >> +	return ERR_PTR(err);
> >>  }
> >>  
> >>  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
> >> @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> >>  	if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
> >>  		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
> >>  
> >> +	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
> >> +		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
> >> +
> >>  	if (flags & ~valid_flags)
> >>  		return -EINVAL;
> >>  
> >> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> >>  	if (!file)
> >>  		return -EFAULT;
> >>  
> >> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> >> +
> >
> > I like the idea of using a write-lock/read-lock to protect write/read access
> > to shareability state (though maybe not necessarily re-using filemap's
> > invalidate lock), it's simple and still allows concurrent faulting in of gmem
> > pages. One issue on the SNP side (which also came up in one of the gmem calls)
> > is if we introduce support for tracking preparedness as discussed (e.g. via a
> > new SHAREABILITY_GUEST_PREPARED state) the
> > SHAREABILITY_GUEST->SHAREABILITY_GUEST_PREPARED transition would occur at
> > fault-time, and so would need to take the write-lock and no longer allow for
> > concurrent fault-handling.
> >
> > I was originally planning on introducing a new rw_semaphore with similar
> > semantics to the rw_lock that Fuad previously had in his restricted mmap
> > series[1] (and similar semantics to filemap invalidate lock here). The main
> > difference, to handle setting SHAREABILITY_GUEST_PREPARED within fault paths,
> > was that in the case of a folio being present for an index, the folio lock would
> > also need to be held in order to update the shareability state. Because
> > of that, fault paths (which will always either have or allocate folio
> > basically) can rely on the folio lock to guard shareability state in a more
> > granular way and so can avoid a global write lock.
> >
> > They would still need to hold the read lock to access the tree however.
> > Or more specifically, any paths that could allocate a folio need to take
> > a read lock so there isn't a TOCTOU situation where shareability is
> > being updated for an index for which a folio hasn't been allocated, but
> > then just afterward the folio gets faulted in/allocated while the
> > shareability state is already being updated, with the understanding that
> > there was no folio around that needed locking.
> >
> > I had a branch with in-place conversion support for SNP[2] that added this
> > lock reworking on top of Fuad's series along with preparation tracking,
> > but I'm now planning to rebase that on top of the patches from this
> > series that Sean mentioned[3] earlier:
> >
> >   KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
> >   KVM: Query guest_memfd for private/shared status
> >   KVM: guest_memfd: Skip LRU for guest_memfd folios
> >   KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
> >   KVM: guest_memfd: Introduce and use shareability to guard faulting
> >   KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes
> >
> > but figured I'd mention it here in case there are other things to consider on
> > the locking front.
> >
> > Definitely agree with Sean though that it would be nice to start identifying a
> > common base of patches for the in-place conversion enablement for SNP, TDX, and
> > pKVM so the APIs/interfaces for hugepages can be handled separately.
> >
> > -Mike
> >
> > [1] https://lore.kernel.org/kvm/20250328153133.3504118-1-tabba@google.com/
> > [2] https://github.com/mdroth/linux/commits/mmap-swprot-v10-snp0-wip2/
> > [3] https://lore.kernel.org/kvm/aC86OsU2HSFZkJP6@google.com/
> >
> >>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> >>  	if (IS_ERR(folio)) {
> >>  		r = PTR_ERR(folio);
> >> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> >>  		*page = folio_file_page(folio, index);
> >>  	else
> >>  		folio_put(folio);
> >> -
> >>  out:
> >> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> >>  	fput(file);
> >>  	return r;
> >>  }
> >> -- 
> >> 2.49.0.1045.g170613ef41-goog
> >> 
>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Vishal Annapurve 5 months, 2 weeks ago
On Wed, Jul 2, 2025 at 4:25 PM Michael Roth <michael.roth@amd.com> wrote:
>
> On Wed, Jun 11, 2025 at 02:51:38PM -0700, Ackerley Tng wrote:
> > Michael Roth <michael.roth@amd.com> writes:
> >
> > > On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> > >> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > >> index 86f74ce7f12a..f609337ae1c2 100644
> > >> --- a/Documentation/virt/kvm/api.rst
> > >> +++ b/Documentation/virt/kvm/api.rst
> > >> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
> > >>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
> > >>  This is validated when the guest_memfd instance is bound to the VM.
> > >>
> > >> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> > >> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> > >> +will initialize the memory for the guest_memfd as guest-only and not faultable
> > >> +by the host.
> > >> +
> > >
> > > KVM_CAP_GMEM_CONVERSION doesn't get introduced until later, so it seems
> > > like this flag should be deferred until that patch is in place. Is it
> > > really needed at that point though? Userspace would be able to set the
> > > initial state via KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls.
> > >
> >
> > I can move this change to the later patch. Thanks! Will fix in the next
> > revision.
> >
> > > The mtree contents seem to get stored in the same manner in either case, so
> > > performance-wise only the overhead of a few userspace<->kernel switches
> > > would be saved. Are there any other reasons?
> > >
> > > Otherwise, maybe just settle on SHARED as a documented default (since at
> > > least non-CoCo VMs would be able to reliably benefit) and let
> > > CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
> > > granularity makes sense for the architecture/guest configuration.
> > >
> >
> > Because shared pages are split once any memory is allocated, having a
> > way to INIT_PRIVATE could avoid the split and subsequent merge on
> > conversion. I feel that is enough value to justify this config flag; what
> > do you think?
> >
> > I guess we could also have userspace be careful not to do any allocation
> > before converting.
>
> I assume we do want to support things like preallocating guest memory, so
> I'm not sure this approach is feasible for avoiding splits.
>
> But I feel like we might be working around a deeper issue here, which is
> that we are pre-emptively splitting anything that *could* be mapped into
> userspace (i.e. allocated+shared/mixed), rather than splitting when
> necessary.
>
> I know that was the plan laid out in the guest_memfd calls, but I've run
> into a couple instances that have me thinking we should revisit this.
>
> 1) Some of the recent guest_memfd discussion seems to be gravitating towards
>    having userspace populate/initialize the guest memory payload prior to boot
>    via mmap()'ing the shared guest_memfd pages, so things work the same as
>    they would for initializing a normal VM's memory payload (rather than
>    relying on back-channels in the kernel to copy user data into guest_memfd
>    pages).
>
>    When you do this though, for an SNP guest at least, that memory
>    acceptance is done in chunks of 4MB (with accept_memory=lazy), and
>    because that will put each 1GB page into an allocated+mixed state,

I would like your help in understanding why we need to start
guest_memfd ranges as shared for SNP guests. guest_memfd ranges being
private should simply mean that those ranges are not faultable by
userspace.

Will the following work?
1) Userspace starts all guest_memfd ranges as private.
2) During early guest boot, the guest starts issuing PSC requests for
converting memory from shared to private
    -> KVM forwards each request to userspace
    -> Userspace checks that the pages are already private and simply
does nothing (a rough sketch of this handling follows).
3) PVALIDATE from the guest on that memory will result in a guest_memfd
offset query, which will cause the RMP table entries to actually get
populated.
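A rough sketch of the userspace handling for step 2, assuming the VMM tracks conversion state on its own; range_already_private() and gmem_convert_private() are hypothetical helpers, the latter wrapping the KVM_GMEM_CONVERT_PRIVATE ioctl proposed in this series:

    #include <stdbool.h>
    #include <stdint.h>
    #include <linux/kvm.h>

    /* Hypothetical VMM helpers: query the VMM's own tracking, and issue
     * the (RFC) KVM_GMEM_CONVERT_PRIVATE ioctl on the guest_memfd fd. */
    bool range_already_private(uint64_t gpa, uint64_t size);
    int gmem_convert_private(int gmem_fd, uint64_t gpa, uint64_t size);

    static void handle_conversion_exit(struct kvm_run *run, int gmem_fd)
    {
            uint64_t gpa = run->memory_fault.gpa;
            uint64_t size = run->memory_fault.size;

            if (!(run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE))
                    return; /* private -> shared requests handled elsewhere */

            /* Step 2 above: everything started private, so the guest's
             * early shared->private PSC requests are no-ops here. */
            if (range_already_private(gpa, size))
                    return;

            gmem_convert_private(gmem_fd, gpa, size);
    }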

>    we end up splitting every 1GB to 4K, and the guest can't even
>    accept/PVALIDATE it at 2MB at that point, even if userspace doesn't touch
>    anything in the range. At some point the guest will convert/accept
>    the entire range, at which point we could merge, but for SNP we'd
>    need guest cooperation to actually use a higher granularity in stage2
>    page tables at that point, since RMP entries are effectively all split
>    to 4K.
>
>    I understand the intent is to default to private where this wouldn't
>    be an issue, and we could punt to userspace to deal with it, but it
>    feels like an artificial restriction to place on userspace. And if we
>    do want to allow/expect guest_memfd contents to be initialized pre-boot
>    just like normal memory, then userspace would need to jump through
>    some hoops:
>
>    - if defaulting to private: add hooks to convert each range that's being
>      modified to a shared state prior to writing to it

Why is that a problem?

>    - if defaulting to shared: initialize memory in-place, then convert
>      everything else to private to avoid unnecessarily splitting folios
>      at run-time
>
>    It feels like implementation details are bleeding out into the API
>    to some degree here (e.g. we'd probably at least need to document
>    this so users know how to take proper advantage of hugepage support).

Does it make sense to keep the default behavior as INIT_PRIVATE for
SNP VMs always even without using hugepages?

>
> 2) There are some use-cases for HugeTLB + CoCo that have come to my
>    attention recently that put a lot of weight on still being able to
>    maximize mapping/hugepage size when accessing shared mem from userspace,
>    e.g. for certain DPDK workloads that accessed shared guest buffers
>    from host userspace. We don't really have a story for this, and I
>    wouldn't expect us to at this stage, but I think it ties into #1 so
>    might be worth considering in that context.

The major problem I see here is that if anything in the kernel does a GUP
on shared memory ranges (which is very likely to happen), it would be
difficult to get those users to let go of the whole hugepage before it can
be split safely.

Another problem is that guest_memfd today doesn't support management of
large userspace page table mappings; this could turn out to be
significant work, along the lines of hugetlb's page table management
logic.

>
> I'm still fine with the current approach as a starting point, but I'm
> wondering if improving both #1/#2 might not be so bad and maybe even
> give us some more flexibility (for instance, Sean had mentioned leaving
> open the option of tracking more than just shareability/mappability, and
> if there is split/merge logic associated with those transitions then
> re-scanning each of these attributes for a 1G range seems like it could
> benefit from some sort of intermediate data structure to help determine
> things like what mapping granularity is available for guest/userspace
> for a particular range.)
>
> One approach I was thinking of was that we introduce a data structure
> similar to KVM's memslot->arch.lpage_info, where we store information
> about what 1G/2M ranges are shared/private/mixed, and then instead of
> splitting ahead of time we just record that state into this data
> structure (using the same write lock as with the
> shareability/mappability state), and then at *fault* time we split the
> folio if our lpage_info-like data structure says the range is mixed.
>
> Then, if the guest converts a 2M/4M range to private while lazily accepting
> (for instance), we can still keep the folio intact as 1GB, but mark
> the 1G range in the lpage_info-like data structure as mixed so that we
> still inform KVM/etc. they need to map it as 2MB or lower in stage2
> page tables. In that case, even at guest fault-time, we can leave the
> folio unsplit until userspace tries to touch it (though in most cases
> it never will and we can keep most of the guest's 1G intact for the
> duration of its lifetime).
>
> On the userspace side, another nice thing there is that if we see the 1G
> range is in a mixed state, but the 2M range is all-shared, then we can still
> leave the folio at 2M,
> and I think the refcount'ing logic would still work for the most part,
> which makes #2 a bit easier to implement as well.
>
> And of course, we wouldn't need the INIT_PRIVATE then since we are only
> splitting when necessary.
>
> But I guess this all comes down to how much extra pain there is in
> tracking a 1G folio that's been split into a mix of 2MB/4K regions,
> but I think we'd get a lot more mileage out of getting that working and
> just completely stripping out all of the merging logic for the initial
> implementation (other than at cleanup time), so maybe complexity-wise
> it balances out a bit?
>
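For illustration, a minimal sketch of such an lpage_info-like summary. All names here are made up, and the orders assume x86 with 4K base pages:

    enum gmem_range_state {
            GMEM_RANGE_PRIVATE,
            GMEM_RANGE_SHARED,
            GMEM_RANGE_MIXED,
    };

    #define GMEM_2M_ORDER   9       /* 512 4K pages per 2M range */
    #define GMEM_1G_ORDER   18      /* 262144 4K pages per 1G range */

    struct gmem_lpage_info {
            u8 *state_2m;   /* one gmem_range_state per 2M range of the inode */
            u8 *state_1g;   /* one gmem_range_state per 1G range of the inode */
    };

    /*
     * Consulted at fault time, under the same lock that guards the
     * shareability state: only a MIXED range forces the backing folio to
     * actually be split. KVM would consult the same summary to cap the
     * stage2 mapping level for a gfn.
     */
    static bool gmem_fault_needs_split(struct gmem_lpage_info *info,
                                       pgoff_t index, unsigned int folio_order)
    {
            if (folio_order >= GMEM_1G_ORDER &&
                info->state_1g[index >> GMEM_1G_ORDER] == GMEM_RANGE_MIXED)
                    return true;
            if (folio_order >= GMEM_2M_ORDER &&
                info->state_2m[index >> GMEM_2M_ORDER] == GMEM_RANGE_MIXED)
                    return true;
            return false;
    }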
> Thanks,
>
> Mike
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Michael Roth 5 months, 2 weeks ago
On Wed, Jul 02, 2025 at 05:46:23PM -0700, Vishal Annapurve wrote:
> On Wed, Jul 2, 2025 at 4:25 PM Michael Roth <michael.roth@amd.com> wrote:
> >
> > On Wed, Jun 11, 2025 at 02:51:38PM -0700, Ackerley Tng wrote:
> > > Michael Roth <michael.roth@amd.com> writes:
> > >

(Re-visiting this with the assumption that we *don't* intend to use mmap() to
populate memory (in which case you can pretty much ignore my previous
response))

I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
userspace already defaults to marking everything private pretty close to
guest_memfd creation time, so the potential for allocations to occur
in-between seems small, but worth confirming.

But I know in the past there was a desire to ensure TDX/SNP could
support pre-allocating guest_memfd memory (and even pre-faulting via
KVM_PRE_FAULT_MEMORY), but I think that could still work, right? The
fallocate() handling could still avoid the split if the whole hugepage
is private, though there is a bit more potential for that fallocate()
to happen before userspace does the "manual" shared->private
conversion. I'll double-check on that aspect, but otherwise, is there
still any other need for it?
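A sketch of what that fallocate() check could look like, reusing the shareability tree from this patch; the helper name is made up, and callers would hold the filemap invalidate lock so the answer stays stable:

    /* True iff every index in [start, end) is SHAREABILITY_GUEST. */
    static bool kvm_gmem_range_is_private(struct inode *inode, pgoff_t start,
                                          pgoff_t end)
    {
            struct maple_tree *mt = &kvm_gmem_private(inode)->shareability;
            unsigned long index = start;
            void *entry;

            mt_for_each(mt, entry, index, end - 1) {
                    if (xa_to_value(entry) != SHAREABILITY_GUEST)
                            return false;
            }
            return true;
    }

kvm_gmem_fallocate() could then allocate at hugepage granularity, skipping the split, whenever this returns true for the folio's range.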

> >
> > I assume we do want to support things like preallocating guest memory, so
> > I'm not sure this approach is feasible for avoiding splits.
> >
> > But I feel like we might be working around a deeper issue here, which is
> > that we are pre-emptively splitting anything that *could* be mapped into
> > userspace (i.e. allocated+shared/mixed), rather than splitting when
> > necessary.
> >
> > I know that was the plan laid out in the guest_memfd calls, but I've run
> > into a couple instances that have me thinking we should revisit this.
> >
> > 1) Some of the recent guest_memfd discussion seems to be gravitating towards
> >    having userspace populate/initialize the guest memory payload prior to boot
> >    via mmap()'ing the shared guest_memfd pages, so things work the same as
> >    they would for initializing a normal VM's memory payload (rather than
> >    relying on back-channels in the kernel to copy user data into guest_memfd
> >    pages).
> >
> >    When you do this though, for an SNP guest at least, that memory
> >    acceptance is done in chunks of 4MB (with accept_memory=lazy), and
> >    because that will put each 1GB page into an allocated+mixed state,
> 
> I would like your help in understanding why we need to start
> guest_memfd ranges as shared for SNP guests. guest_memfd ranges being
> private should simply mean that those ranges are not faultable by
> userspace.

It seems like I probably misremembered, but I thought there was a
discussion on the guest_memfd call a month (or so?) ago about whether to
continue to use backchannels to populate guest_memfd pages prior to
launch. It was in the context of whether to keep using kvm_gmem_populate()
for populating guest_memfd pages by copying them in from a separate
userspace buffer vs. simply populating them directly from userspace.
I thought we were leaning toward the latter since it was simpler all-around,
which is great for SNP since that is already how it populates memory: by
writing to it from userspace, which kvm_gmem_populate() then copies into
guest_memfd pages. With shared gmem support, we would just skip the latter
step in the kernel rather than needing changes to how userspace handles
things in that regard. But maybe that was just wishful thinking :)

But you raise some very compelling points on why this might not be a
good idea even if that was how that discussion went.

> 
> Will the following work?
> 1) Userspace starts all guest_memfd ranges as private.
> 2) During early guest boot, the guest starts issuing PSC requests for
> converting memory from shared to private
>     -> KVM forwards each request to userspace
>     -> Userspace checks that the pages are already private and simply
> does nothing.
> 3) PVALIDATE from the guest on that memory will result in a guest_memfd
> offset query, which will cause the RMP table entries to actually get
> populated.

That would work, but there will need to be changes in userspace to deal
with how SNP populates memory pre-boot, since today it does so just like
normal VMs do. We would instead need to copy that data into separate
buffers, and pass those in as the buffer hva instead of the shared hva
corresponding to that GPA.

> 
> >    we end up splitting every 1GB to 4K, and the guest can't even
> >    accept/PVALIDATE it at 2MB at that point, even if userspace doesn't touch
> >    anything in the range. At some point the guest will convert/accept
> >    the entire range, at which point we could merge, but for SNP we'd
> >    need guest cooperation to actually use a higher granularity in stage2
> >    page tables at that point, since RMP entries are effectively all split
> >    to 4K.
> >
> >    I understand the intent is to default to private where this wouldn't
> >    be an issue, and we could punt to userspace to deal with it, but it
> >    feels like an artificial restriction to place on userspace. And if we
> >    do want to allow/expect guest_memfd contents to be initialized pre-boot
> >    just like normal memory, then userspace would need to jump through
> >    some hoops:
> >
> >    - if defaulting to private: add hooks to convert each range that's being
> >      modified to a shared state prior to writing to it
> 
> Why is that a problem?

These were only problems if we went the above-mentioned way of
populating memory pre-boot via mmap() instead of other backchannels. If
we don't do that, then both these things cease to be problems. Sounds good
to me. :)

> 
> >    - if defaulting to shared: initialize memory in-place, then convert
> >      everything else to private to avoid unnecessarily splitting folios
> >      at run-time
> >
> >    It feels like implementation details are bleeding out into the API
> >    to some degree here (e.g. we'd probably at least need to document
> >    this so users know how to take proper advantage of hugepage support).
> 
> Does it make sense to keep the default behavior as INIT_PRIVATE for
> SNP VMs always even without using hugepages?

Yes!

Though, revisiting the discussion around INIT_PRIVATE (without the baggage
of potentially relying on mmap() to populate memory), I'm still not sure why
it's needed. I responded in the context of Ackerley's initial reply
above.

> 
> >
> > 2) There are some use-cases for HugeTLB + CoCo that have come to my
> >    attention recently that put a lot of weight on still being able to
> >    maximize mapping/hugepage size when accessing shared mem from userspace,
> >    e.g. for certain DPDK workloads that accessed shared guest buffers
> >    from host userspace. We don't really have a story for this, and I
> >    wouldn't expect us to at this stage, but I think it ties into #1 so
> >    might be worth considering in that context.
> 
> The major problem I see here is that if anything in the kernel does a GUP
> on shared memory ranges (which is very likely to happen), it would be
> difficult to get those users to let go of the whole hugepage before it can
> be split safely.
> 
> Another problem is that guest_memfd today doesn't support management of
> large userspace page table mappings; this could turn out to be
> significant work, along the lines of hugetlb's page table management
> logic.

Yeah, that was more line-of-sight toward what might be possible by going
this route, but the refcounting issue above is a showstopper as always. I'd
somehow convinced myself that supporting fine-grained splitting somehow
worked around it, but you still have no idea which pages you need to avoid
converting, and fancy splitting doesn't get you past that. More wishful
thinking. =\

Thanks,

Mike

Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Fuad Tabba 4 months, 1 week ago
Hi,

On Thu, 3 Jul 2025 at 05:12, Michael Roth <michael.roth@amd.com> wrote:
>
> (Re-visiting this with the assumption that we *don't* intend to use mmap() to
> populate memory (in which case you can pretty much ignore my previous
> response))
>
> I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
> userspace already defaults to marking everything private pretty close to
> guest_memfd creation time, so the potential for allocations to occur
> in-between seems small, but worth confirming.
>
> But I know in the past there was a desire to ensure TDX/SNP could
> support pre-allocating guest_memfd memory (and even pre-faulting via
> KVM_PRE_FAULT_MEMORY), but I think that could still work, right? The
> fallocate() handling could still avoid the split if the whole hugepage
> is private, though there is a bit more potential for that fallocate()
> to happen before userspace does the "manual" shared->private
> conversion. I'll double-check on that aspect, but otherwise, is there
> still any other need for it?

It's not just about performance. I think the need is more a matter of
having a consistent API across the hypervisors guest_memfd is going to
support. Memory in guest_memfd is shared by default, but in pKVM, for
example, it is private by default. Therefore, it would be good to have a
way to ensure that all guest_memfd allocations can be made private from
the get-go.
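As a usage sketch, a VMM that wants pKVM-like private-by-default semantics would create the guest_memfd as below (error handling elided; per this patch, INIT_PRIVATE is only accepted alongside SUPPORT_SHARED):

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int create_private_gmem(int vm_fd, uint64_t size)
    {
            struct kvm_create_guest_memfd gmem = {
                    .size  = size,
                    .flags = GUEST_MEMFD_FLAG_SUPPORT_SHARED |
                             GUEST_MEMFD_FLAG_INIT_PRIVATE,
            };

            /* Every offset starts as SHAREABILITY_GUEST: host faults on
             * these ranges fail until explicitly converted to shared. */
            return ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
    }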

Cheers,
/fuad

> > >
> > > I assume we do want to support things like preallocating guest memory so
> > > not sure this approach is feasible to avoid splits.
> > >
> > > But I feel like we might be working around a deeper issue here, which is
> > > that we are pre-emptively splitting anything that *could* be mapped into
> > > userspace (i.e. allocated+shared/mixed), rather than splitting when
> > > necessary.
> > >
> > > I know that was the plan laid out in the guest_memfd calls, but I've run
> > > into a couple instances that have me thinking we should revisit this.
> > >
> > > 1) Some of the recent guest_memfd seems to be gravitating towards having
> > >    userspace populate/initialize guest memory payload prior to boot via
> > >    mmap()'ing the shared guest_memfd pages so things work the same as
> > >    they would for initialized normal VM memory payload (rather than
> > >    relying on back-channels in the kernel to user data into guest_memfd
> > >    pages).
> > >
> > >    When you do this though, for an SNP guest at least, that memory
> > >    acceptance is done in chunks of 4MB (with accept_memory=lazy), and
> > >    because that will put each 1GB page into an allocated+mixed state,
> >
> > I would like your help in understanding why we need to start
> > guest_memfd ranges as shared for SNP guests. guest_memfd ranges being
> > private should simply mean that certain ranges are not faultable by
> > userspace.
>
> It's seeming like I probably misremembered, but I thought there was a
> discussion on the guest_memfd call a month (or so?) ago about whether to
> continue to use backchannels to populate guest_memfd pages prior to
> launch. It was in the context of whether to keep using kvm_gmem_populate()
> for populating guest_memfd pages by copying them in from separate
> userspace buffer vs. simply populating them directly from userspace.
> I thought we were leaning on the latter since it was simpler all-around,
> which is great for SNP since that is already how it populates memory: by
> writing to it from userspace, which kvm_gmem_populate() then copies into
> guest_memfd pages. With shared gmem support, we just skip the latter now
> in the kernel rather than needing changes to how userspace handles things in
> that regard. But maybe that was just wishful thinking :)
>
> But you raise some very compelling points on why this might not be a
> good idea even if that was how that discussion went.
>
> >
> > Will the following work?
> > 1) Userspace starts all guest_memfd ranges as private.
> > 2) During early guest boot it starts issuing PSC requests for
> > converting memory from shared to private
> >     -> KVM forwards this request to userspace
> >     -> Userspace checks that the pages are already private and simply
> > does nothing.
> > 3) Pvalidate from guest on that memory will result in guest_memfd
> > offset query which will cause the RMP table entries to actually get
> > populated.
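> >
> > As a rough sketch of the userspace side of step 2 (assuming the PSC
> > request reaches userspace via the KVM_HC_MAP_GPA_RANGE hypercall exit
> > as it does today; range_is_already_private() is a made-up helper, and
> > run is the vCPU's struct kvm_run):
> >
> >   if (run->exit_reason == KVM_EXIT_HYPERCALL &&
> >       run->hypercall.nr == KVM_HC_MAP_GPA_RANGE) {
> >           __u64 gpa = run->hypercall.args[0];
> >           __u64 npages = run->hypercall.args[1];
> >           bool to_private = run->hypercall.args[2] &
> >                             KVM_MAP_GPA_RANGE_ENCRYPTED;
> >
> >           /* Already private in guest_memfd: nothing to convert. */
> >           if (to_private && range_is_already_private(gpa, npages))
> >                   run->hypercall.ret = 0;
> >   }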
>
> That would work, but there will need to be changes in userspace to deal
> with how SNP populates memory pre-boot just like normal VMs do. We will
> instead need to copy that data into separate buffers, and pass those in
> as the buffer hva instead of the shared hva corresponding to that GPA.
>
> But that seems reasonable if it avoids so many other problems.
>
> >
> > >    we end up splitting every 1GB to 4K and the guest can't even
> > >    accept/PVALIDATE it 2MB at that point even if userspace doesn't touch
> > >    anything in the range. At some point the guest will convert/accept
> > >    the entire range, at which point we could merge, but for SNP we'd
> > >    need guest cooperation to actually use a higher granularity in stage2
> > >    page tables at that point since RMP entries are effectively all split
> > >    to 4K.
> > >
> > >    I understand the intent is to default to private where this wouldn't
> > >    be an issue, and we could punt to userspace to deal with it, but it
> > >    feels like an artificial restriction to place on userspace. And if we
> > >    do want to allow/expect guest_memfd contents to be initialized pre-boot
> > >    just like normal memory, then userspace would need to jump through
> > >    some hoops:
> > >
> > >    - if defaulting to private: add hooks to convert each range that's being
> > >      modified to a shared state prior to writing to it
> >
> > Why is that a problem?
>
> These were only problems if we went the above-mentioned way of
> populating memory pre-boot via mmap() instead of other backchannels. If
> we don't do that, then both these things cease to be problems. Sounds good
> to me. :)
>
> >
> > >    - if defaulting to shared: initialize memory in-place, then convert
> > >      everything else to private to avoid unnecessarily splitting folios
> > >      at run-time
> > >
> > >    It feels like implementation details are bleeding out into the API
> > >    to some degree here (e.g. we'd probably at least need to document
> > >    this so users know how to take proper advantage of hugepage support).
> >
> > Does it make sense to always keep the default behavior as INIT_PRIVATE for
> > SNP VMs, even without using hugepages?
>
> Yes!
>
> Though, revisiting the discussion around INIT_PRIVATE (without the baggage
> of potentially relying on mmap() to populate memory), I'm still not sure why
> it's needed. I responded in the context of Ackerley's initial reply
> above.
>
> >
> > >
> > > 2) There are some use-cases for HugeTLB + CoCo that have come to my
> > >    attention recently that put a lot of weight on still being able to
> > >    maximize mapping/hugepage size when accessing shared mem from userspace,
> > >    e.g. for certain DPDK workloads that access shared guest buffers
> > >    from host userspace. We don't really have a story for this, and I
> > >    wouldn't expect us to at this stage, but I think it ties into #1 so
> > >    might be worth considering in that context.
> >
> > A major problem I see here is that if anything in the kernel does a GUP
> > on shared memory ranges (which is very likely to happen), it would be
> > difficult to get them to let go of the whole hugepage before it can be
> > split safely.
> >
> > Another problem is that guest_memfd today doesn't support management of
> > large userspace page table mappings; this can turn out to be
> > significant work, judging by the hugetlb pagetable management
> > logic.
>
> Yah, that was more about the line-of-sight that might be possible by going
> this route, but the refcount'ing issue above is a showstopper as always. I'd
> somehow convinced myself that supporting fine-grained splitting
> worked around it, but you still have no idea what page you need to avoid
> converting, and fancy splitting doesn't get you past that. More wishful
> thinking. =\
>
> Thanks,
>
> Mike
>
> >
> > >
> > > I'm still fine with the current approach as a starting point, but I'm
> > > wondering if improving both #1/#2 might not be so bad and maybe even
> > > give us some more flexibility (for instance, Sean had mentioned leaving
> > > open the option of tracking more than just shareability/mappability, and
> > > if there is split/merge logic associated with those transitions then
> > > re-scanning each of these attributes for a 1G range seems like it could
> > > benefit from some sort of intermediate data structure to help determine
> > > things like what mapping granularity is available for guest/userspace
> > > for a particular range).
> > >
> > > One approach I was thinking of was that we introduce a data structure
> > > similar to KVM's memslot->arch.lpage_info, where we store information
> > > about what 1G/2M ranges are shared/private/mixed, and then instead of
> > > splitting ahead of time we just record that state into this data
> > > structure (using the same write lock as with the
> > > shareability/mappability state), and then at *fault* time we split the
> > > folio if our lpage_info-like data structure says the range is mixed.
> > >
> > > Then, if the guest converts a 2M/4M range to private while lazily accepting
> > > (for instance), we can still keep the folio intact as 1GB, but mark
> > > the 1G range in the lpage_info-like data structure as mixed so that we
> > > still inform KVM/etc. they need to map it as 2MB or lower in stage2
> > > page tables. In that case, even at guest fault-time, we can leave the
> > > folio unsplit until userspace tries to touch it (though in most cases
> > > it never will and we can keep most of the guest's 1G intact for the
> > > duration of its lifetime).
> > >
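> > > As a very rough sketch of that lpage_info-like structure (names
> > > invented here, nothing tested): conversion paths would update per-level
> > > "mixed" counters under the write lock, and the fault path would split
> > > only when the counter covering its range is non-zero:
> > >
> > >   /* Per-level tracking, like memslot->arch.lpage_info. */
> > >   struct gmem_lpage_info {
> > >           unsigned long nr_units;
> > >           /* Non-zero: unit is mixed, so map/split at a lower level. */
> > >           unsigned long *mixed_count;
> > >   };
> > >
> > >   struct gmem_hugepage_state {
> > >           struct gmem_lpage_info level[2];        /* 2M and 1G */
> > >   };
> > >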
> > > On the userspace side, another nice thing there is that if we see the 1G is in a
> > > mixed state, but a 2M is all-shared, then we can still leave the folio as 2M,
> > > and I think the refcount'ing logic would still work for the most part,
> > > which makes #2 a bit easier to implement as well.
> > >
> > > And of course, we wouldn't need the INIT_PRIVATE flag then since we are only
> > > splitting when necessary.
> > >
> > > But I guess this all comes down to how much extra pain there is in
> > > tracking a 1G folio that's been split into a mix of 2MB/4K regions,
> > > but I think we'd get a lot more mileage out of getting that working and
> > > just completely stripping out all of the merging logic for initial
> > > implementation (other than at cleanup time), so maybe complexity-wise
> > > it balances out a bit?
> > >
> > > Thanks,
> > >
> > > Mike
> > >
> > > >
> > > > >>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
> > > > >>
> > > > >>  4.143 KVM_PRE_FAULT_MEMORY
> > > > >> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > > > >> index 4cc824a3a7c9..d7df312479aa 100644
> > > > >> --- a/include/uapi/linux/kvm.h
> > > > >> +++ b/include/uapi/linux/kvm.h
> > > > >> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
> > > > >>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
> > > > >>
> > > > >>  #define KVM_CREATE_GUEST_MEMFD    _IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> > > > >> +
> > > > >>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED   (1UL << 0)
> > > > >> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE     (1UL << 1)
> > > > >>
> > > > >>  struct kvm_create_guest_memfd {
> > > > >>    __u64 size;
> > > > >> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > > > >> index 239d0f13dcc1..590932499eba 100644
> > > > >> --- a/virt/kvm/guest_memfd.c
> > > > >> +++ b/virt/kvm/guest_memfd.c
> > > > >> @@ -4,6 +4,7 @@
> > > > >>  #include <linux/falloc.h>
> > > > >>  #include <linux/fs.h>
> > > > >>  #include <linux/kvm_host.h>
> > > > >> +#include <linux/maple_tree.h>
> > > > >>  #include <linux/pseudo_fs.h>
> > > > >>  #include <linux/pagemap.h>
> > > > >>
> > > > >> @@ -17,6 +18,24 @@ struct kvm_gmem {
> > > > >>    struct list_head entry;
> > > > >>  };
> > > > >>
> > > > >> +struct kvm_gmem_inode_private {
> > > > >> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > > > >> +  struct maple_tree shareability;
> > > > >> +#endif
> > > > >> +};
> > > > >> +
> > > > >> +enum shareability {
> > > > >> +  SHAREABILITY_GUEST = 1, /* Only the guest can map (fault) folios in this range. */
> > > > >> +  SHAREABILITY_ALL = 2,   /* Both guest and host can fault folios in this range. */
> > > > >> +};
> > > > >> +
> > > > >> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> > > > >> +
> > > > >> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> > > > >> +{
> > > > >> +  return inode->i_mapping->i_private_data;
> > > > >> +}
> > > > >> +
> > > > >>  /**
> > > > >>   * folio_file_pfn - like folio_file_page, but return a pfn.
> > > > >>   * @folio: The folio which contains this index.
> > > > >> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
> > > > >>    return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
> > > > >>  }
> > > > >>
> > > > >> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > > > >> +
> > > > >> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
> > > > >> +                                loff_t size, u64 flags)
> > > > >> +{
> > > > >> +  enum shareability m;
> > > > >> +  pgoff_t last;
> > > > >> +
> > > > >> +  last = (size >> PAGE_SHIFT) - 1;
> > > > >> +  m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
> > > > >> +                                              SHAREABILITY_ALL;
> > > > >> +  return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
> > > > >> +                           GFP_KERNEL);
> > > > >
> > > > > One really nice thing about using a maple tree is that it should get rid
> > > > > of a fairly significant startup delay for SNP/TDX when the entire xarray gets
> > > > > initialized with private attribute entries via KVM_SET_MEMORY_ATTRIBUTES
> > > > > (which is the current QEMU default behavior).
> > > > >
> > > > > I'd originally advocated for sticking with the xarray implementation Fuad was
> > > > > using until we'd determined we really need it for HugeTLB support, but I'm
> > > > > sort of thinking it's already justified just based on the above.
> > > > >
> > > > > Maybe it would make sense for KVM memory attributes too?
> > > > >
> > > > >> +}
> > > > >> +
> > > > >> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> > > > >> +                                           pgoff_t index)
> > > > >> +{
> > > > >> +  struct maple_tree *mt;
> > > > >> +  void *entry;
> > > > >> +
> > > > >> +  mt = &kvm_gmem_private(inode)->shareability;
> > > > >> +  entry = mtree_load(mt, index);
> > > > >> +  WARN(!entry,
> > > > >> +       "Shareability should always be defined for all indices in inode.");
> > > > >> +
> > > > >> +  return xa_to_value(entry);
> > > > >> +}
> > > > >> +
> > > > >> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> > > > >> +{
> > > > >> +  if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> > > > >> +          return ERR_PTR(-EACCES);
> > > > >> +
> > > > >> +  return kvm_gmem_get_folio(inode, index);
> > > > >> +}
> > > > >> +
> > > > >> +#else
> > > > >> +
> > > > >> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private, loff_t size, u64 flags)
> > > > >> +{
> > > > >> +  return 0;
> > > > >> +}
> > > > >> +
> > > > >> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> > > > >> +{
> > > > >> +  WARN_ONCE(1, "Unexpected call to get shared folio.");
> > > > >> +  return NULL;
> > > > >> +}
> > > > >> +
> > > > >> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> > > > >> +
> > > > >>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > >>                                pgoff_t index, struct folio *folio)
> > > > >>  {
> > > > >> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> > > > >>
> > > > >>    filemap_invalidate_lock_shared(inode->i_mapping);
> > > > >>
> > > > >> -  folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> > > > >> +  folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> > > > >>    if (IS_ERR(folio)) {
> > > > >>            int err = PTR_ERR(folio);
> > > > >>
> > > > >> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
> > > > >>    .fallocate      = kvm_gmem_fallocate,
> > > > >>  };
> > > > >>
> > > > >> +static void kvm_gmem_free_inode(struct inode *inode)
> > > > >> +{
> > > > >> +  struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> > > > >> +
> > > > >> +  kfree(private);
> > > > >> +
> > > > >> +  free_inode_nonrcu(inode);
> > > > >> +}
> > > > >> +
> > > > >> +static void kvm_gmem_destroy_inode(struct inode *inode)
> > > > >> +{
> > > > >> +  struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> > > > >> +
> > > > >> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > > > >> +  /*
> > > > >> +   * mtree_destroy() can't be used within rcu callback, hence can't be
> > > > >> +   * done in ->free_inode().
> > > > >> +   */
> > > > >> +  if (private)
> > > > >> +          mtree_destroy(&private->shareability);
> > > > >> +#endif
> > > > >> +}
> > > > >> +
> > > > >>  static const struct super_operations kvm_gmem_super_operations = {
> > > > >>    .statfs         = simple_statfs,
> > > > >> +  .destroy_inode  = kvm_gmem_destroy_inode,
> > > > >> +  .free_inode     = kvm_gmem_free_inode,
> > > > >>  };
> > > > >>
> > > > >>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
> > > > >> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
> > > > >>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
> > > > >>                                                  loff_t size, u64 flags)
> > > > >>  {
> > > > >> +  struct kvm_gmem_inode_private *private;
> > > > >>    struct inode *inode;
> > > > >> +  int err;
> > > > >>
> > > > >>    inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
> > > > >>    if (IS_ERR(inode))
> > > > >>            return inode;
> > > > >>
> > > > >> +  err = -ENOMEM;
> > > > >> +  private = kzalloc(sizeof(*private), GFP_KERNEL);
> > > > >> +  if (!private)
> > > > >> +          goto out;
> > > > >> +
> > > > >> +  mt_init(&private->shareability);
> > > > >> +  inode->i_mapping->i_private_data = private;
> > > > >> +
> > > > >> +  err = kvm_gmem_shareability_setup(private, size, flags);
> > > > >> +  if (err)
> > > > >> +          goto out;
> > > > >> +
> > > > >>    inode->i_private = (void *)(unsigned long)flags;
> > > > >>    inode->i_op = &kvm_gmem_iops;
> > > > >>    inode->i_mapping->a_ops = &kvm_gmem_aops;
> > > > >> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
> > > > >>    WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> > > > >>
> > > > >>    return inode;
> > > > >> +
> > > > >> +out:
> > > > >> +  iput(inode);
> > > > >> +
> > > > >> +  return ERR_PTR(err);
> > > > >>  }
> > > > >>
> > > > >>  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
> > > > >> @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> > > > >>    if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
> > > > >>            valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
> > > > >>
> > > > >> +  if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
> > > > >> +          valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
> > > > >> +
> > > > >>    if (flags & ~valid_flags)
> > > > >>            return -EINVAL;
> > > > >>
> > > > >> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > >>    if (!file)
> > > > >>            return -EFAULT;
> > > > >>
> > > > >> +  filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> > > > >> +
> > > > >
> > > > > I like the idea of using a write-lock/read-lock to protect write/read access
> > > > > to shareability state (though maybe not necessarily re-using filemap's
> > > > > invalidate lock), it's simple and still allows concurrent faulting in of gmem
> > > > > pages. One issue on the SNP side (which also came up in one of the gmem calls)
> > > > > is that if we introduce support for tracking preparedness as discussed (e.g. via a
> > > > > new SHAREABILITY_GUEST_PREPARED state), the
> > > > > SHAREABILITY_GUEST->SHAREABILITY_GUEST_PREPARED transition would occur at
> > > > > fault-time, and so would need to take the write-lock and no longer allow for
> > > > > concurrent fault-handling.
> > > > >
> > > > > I was originally planning on introducing a new rw_semaphore with similar
> > > > > semantics to the rw_lock that Fuad previously had in his restricted mmap
> > > > > series[1] (and similar semantics to the filemap invalidate lock here). The main
> > > > > difference, to handle setting SHAREABILITY_GUEST_PREPARED within fault paths,
> > > > > was that in the case of a folio being present for an index, the folio lock would
> > > > > also need to be held in order to update the shareability state. Because
> > > > > of that, fault paths (which will basically always either have or allocate a
> > > > > folio) can rely on the folio lock to guard shareability state in a more
> > > > > granular way and so can avoid a global write lock.
> > > > >
> > > > > They would still need to hold the read lock to access the tree however.
> > > > > Or more specifically, any paths that could allocate a folio need to take
> > > > > a read lock so there isn't a TOCTOU situation where shareability is
> > > > > being updated for an index for which a folio hasn't been allocated, but
> > > > > then just afterward the folio gets faulted in/allocated while the
> > > > > shareability state is already being updated, with the understanding that
> > > > > there was no folio around that needed locking.
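> > > > >
> > > > > Roughly, in code form (just a sketch; the filemap invalidate lock is a
> > > > > stand-in for whichever rw_semaphore this ends up being, and
> > > > > mapping/index come from context):
> > > > >
> > > > >   /* Conversion (writer) side: */
> > > > >   filemap_invalidate_lock(mapping);
> > > > >   folio = filemap_lock_folio(mapping, index);
> > > > >   /* update shareability; folio lock held if a folio exists */
> > > > >   if (!IS_ERR(folio)) {
> > > > >           folio_unlock(folio);
> > > > >           folio_put(folio);
> > > > >   }
> > > > >   filemap_invalidate_unlock(mapping);
> > > > >
> > > > >   /* Fault side (folio lock already held by the fault path): */
> > > > >   filemap_invalidate_lock_shared(mapping);
> > > > >   /* SHAREABILITY_GUEST -> SHAREABILITY_GUEST_PREPARED under folio lock */
> > > > >   filemap_invalidate_unlock_shared(mapping);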
> > > > >
> > > > > I had a branch with in-place conversion support for SNP[2] that added this
> > > > > lock reworking on top of Fuad's series along with preparation tracking,
> > > > > but I'm now planning to rebase that on top of the patches from this
> > > > > series that Sean mentioned[3] earlier:
> > > > >
> > > > >   KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
> > > > >   KVM: Query guest_memfd for private/shared status
> > > > >   KVM: guest_memfd: Skip LRU for guest_memfd folios
> > > > >   KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
> > > > >   KVM: guest_memfd: Introduce and use shareability to guard faulting
> > > > >   KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes
> > > > >
> > > > > but figured I'd mention it here in case there are other things to consider on
> > > > > the locking front.
> > > > >
> > > > > Definitely agree with Sean though that it would be nice to start identifying a
> > > > > common base of patches for the in-place conversion enablement for SNP, TDX, and
> > > > > pKVM so the APIs/interfaces for hugepages can be handled separately.
> > > > >
> > > > > -Mike
> > > > >
> > > > > [1] https://lore.kernel.org/kvm/20250328153133.3504118-1-tabba@google.com/
> > > > > [2] https://github.com/mdroth/linux/commits/mmap-swprot-v10-snp0-wip2/
> > > > > [3] https://lore.kernel.org/kvm/aC86OsU2HSFZkJP6@google.com/
> > > > >
> > > > >>    folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> > > > >>    if (IS_ERR(folio)) {
> > > > >>            r = PTR_ERR(folio);
> > > > >> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > >>            *page = folio_file_page(folio, index);
> > > > >>    else
> > > > >>            folio_put(folio);
> > > > >> -
> > > > >>  out:
> > > > >> +  filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> > > > >>    fput(file);
> > > > >>    return r;
> > > > >>  }
> > > > >> --
> > > > >> 2.49.0.1045.g170613ef41-goog
> > > > >>
> > > >
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ira Weiny 4 months ago
Fuad Tabba wrote:
> Hi,
> 
> On Thu, 3 Jul 2025 at 05:12, Michael Roth <michael.roth@amd.com> wrote:
> >
> > On Wed, Jul 02, 2025 at 05:46:23PM -0700, Vishal Annapurve wrote:
> > > On Wed, Jul 2, 2025 at 4:25 PM Michael Roth <michael.roth@amd.com> wrote:
> > > >
> > > > On Wed, Jun 11, 2025 at 02:51:38PM -0700, Ackerley Tng wrote:
> > > > > Michael Roth <michael.roth@amd.com> writes:
> > > > >
> > > > > > On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:

[snip]

> > > > > > The mtree contents seem to get stored in the same manner in either case, so
> > > > > > performance-wise only the overhead of a few userspace<->kernel switches
> > > > > > would be saved. Are there any other reasons?
> > > > > >
> > > > > > Otherwise, maybe just settle on SHARED as a documented default (since at
> > > > > > least non-CoCo VMs would be able to reliably benefit) and let
> > > > > > CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
> > > > > > granularity makes sense for the architecture/guest configuration.
> > > > > >
> > > > >
> > > > > Because shared pages are split once any memory is allocated, having a
> > > > > way to INIT_PRIVATE could avoid the split and then merge on
> > > > > conversion. I feel that is enough value to have this config flag; what
> > > > > do you think?
> > > > >
> > > > > I guess we could also have userspace be careful not to do any allocation
> > > > > before converting.
> >
> > (Re-visiting this with the assumption that we *don't* intend to use mmap() to
> > populate memory (in which case you can pretty much ignore my previous
> > response))
> >
> > I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
> > userspace already defaults to marking everything private pretty close to
> > guest_memfd creation time, so the potential for allocations to occur
> > in-between seems small, but worth confirming.
> >
> > But I know in the past there was a desire to ensure TDX/SNP could
> > support pre-allocating guest_memfd memory (and even pre-faulting via
> > KVM_PRE_FAULT_MEMORY), but I think that could still work right? The
> > fallocate() handling could still avoid the split if the whole hugepage
> > is private, though there is a bit more potential for that fallocate()
> > to happen before userspace does the "manually" shared->private
> > conversion. I'll double-check on that aspect, but otherwise, is there
> > still any other need for it?
> 
> It's not just about performance. I think that the need is more a
> matter of having a consistent API with the hypervisors guest_memfd is
> going to support. Memory in guest_memfd is shared by default, but in
> pKVM for example, it's private by default. Therefore, it would be good
  ^^^^^^^^^^^^^^^^
And CoCo VMs as well, right?

Ira

> to have a way to ensure that all guest_memfd allocations can be made
> private from the get-go.
> 
> Cheers,
> /fuad
> 

[snip]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Vishal Annapurve 5 months, 2 weeks ago
On Wed, Jul 2, 2025 at 9:12 PM Michael Roth <michael.roth@amd.com> wrote:
>
> On Wed, Jul 02, 2025 at 05:46:23PM -0700, Vishal Annapurve wrote:
> > On Wed, Jul 2, 2025 at 4:25 PM Michael Roth <michael.roth@amd.com> wrote:
> > >
> > > On Wed, Jun 11, 2025 at 02:51:38PM -0700, Ackerley Tng wrote:
> > > > Michael Roth <michael.roth@amd.com> writes:
> > > >
> > > > > On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> > > > >> Track guest_memfd memory's shareability status within the inode as
> > > > >> opposed to the file, since it is property of the guest_memfd's memory
> > > > >> contents.
> > > > >>
> > > > >> Shareability is a property of the memory and is indexed using the
> > > > >> page's index in the inode. Because shareability is the memory's
> > > > >> property, it is stored within guest_memfd instead of within KVM, like
> > > > >> in kvm->mem_attr_array.
> > > > >>
> > > > >> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> > > > >> retained to allow VMs to only use guest_memfd for private memory and
> > > > >> some other memory for shared memory.
> > > > >>
> > > > >> Not all use cases require guest_memfd() to be shared with the host
> > > > >> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> > > > >> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> > > > >> private to the guest, and therefore not mappable by the
> > > > >> host. Otherwise, memory is shared until explicitly converted to
> > > > >> private.
> > > > >>
> > > > >> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> > > > >> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> > > > >> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> > > > >> Co-developed-by: Fuad Tabba <tabba@google.com>
> > > > >> Signed-off-by: Fuad Tabba <tabba@google.com>
> > > > >> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> > > > >> ---
> > > > >>  Documentation/virt/kvm/api.rst |   5 ++
> > > > >>  include/uapi/linux/kvm.h       |   2 +
> > > > >>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
> > > > >>  3 files changed, 129 insertions(+), 2 deletions(-)
> > > > >>
> > > > >> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > > > >> index 86f74ce7f12a..f609337ae1c2 100644
> > > > >> --- a/Documentation/virt/kvm/api.rst
> > > > >> +++ b/Documentation/virt/kvm/api.rst
> > > > >> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
> > > > >>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
> > > > >>  This is validated when the guest_memfd instance is bound to the VM.
> > > > >>
> > > > >> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> > > > >> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> > > > >> +will initialize the memory for the guest_memfd as guest-only and not faultable
> > > > >> +by the host.
> > > > >> +
> > > > >
> > > > > KVM_CAP_GMEM_CONVERSION doesn't get introduced until later, so it seems
> > > > > like this flag should be deferred until that patch is in place. Is it
> > > > > really needed at that point though? Userspace would be able to set the
> > > > > initial state via KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls.
> > > > >
> > > >
> > > > I can move this change to the later patch. Thanks! Will fix in the next
> > > > revision.
> > > >
> > > > > The mtree contents seem to get stored in the same manner in either case, so
> > > > > performance-wise only the overhead of a few userspace<->kernel switches
> > > > > would be saved. Are there any other reasons?
> > > > >
> > > > > Otherwise, maybe just settle on SHARED as a documented default (since at
> > > > > least non-CoCo VMs would be able to reliably benefit) and let
> > > > > CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
> > > > > granularity makes sense for the architecture/guest configuration.
> > > > >
> > > >
> > > > Because shared pages are split once any memory is allocated, having a
> > > > way to INIT_PRIVATE could avoid the split and then merge on
> > > > conversion. I feel that is enough value to have this config flag; what
> > > > do you think?
> > > >
> > > > I guess we could also have userspace be careful not to do any allocation
> > > > before converting.
>
> (Re-visiting this with the assumption that we *don't* intend to use mmap() to
> populate memory (in which case you can pretty much ignore my previous
> response))

I am assuming in-place conversion with huge page backing for the
discussion below.

Looks like there are three scenarios/use cases we are discussing here:
1) Pre-allocating guest_memfd file offsets
   - Userspace can use fallocate to do this for hugepages by keeping
the file ranges marked private.
2) Prefaulting guest EPT/NPT entries
3) Populating initial guest payload into guest_memfd memory
   - Userspace can mark certain ranges as shared, populate the
contents and convert the ranges back to private. So mmap will come in
handy here.
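
For (1), a rough sketch of the idea (gmem_fd and gmem_size are
placeholders, error handling mostly omitted):

  /*
   * With the whole file range still private, fallocate() can
   * preallocate backing memory without any hugepage getting split,
   * since none of it is host-faultable yet.
   */
  if (fallocate(gmem_fd, 0, 0, gmem_size))
          err(1, "fallocate");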

>
> I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
> userspace already defaults to marking everything private pretty close to
> guest_memfd creation time, so the potential for allocations to occur
> in-between seems small, but worth confirming.

Ok, I am not much worried about whether the INIT_PRIVATE flag gets
supported or not, but more about the default setting that different
CVMs start with. To me, it looks like all CVMs should start with
everything private by default, and if there is a way to bake that
configuration in at guest_memfd creation time, that would be good to
have instead of doing "create and convert" operations; there is a
fairly low cost to supporting this flag.

>
> But I know in the past there was a desire to ensure TDX/SNP could
> support pre-allocating guest_memfd memory (and even pre-faulting via
> KVM_PRE_FAULT_MEMORY), but I think that could still work right? The
> fallocate() handling could still avoid the split if the whole hugepage
> is private, though there is a bit more potential for that fallocate()
> to happen before userspace does the "manual" shared->private
> conversion. I'll double-check on that aspect, but otherwise, is there
> still any other need for it?

This use case of being able to preallocate should still work with
in-place conversion, assuming all ranges are private before
pre-population.

>
> > >
> > > I assume we do want to support things like preallocating guest memory, so
> > > I'm not sure this approach is feasible to avoid splits.
> > >
> > > But I feel like we might be working around a deeper issue here, which is
> > > that we are pre-emptively splitting anything that *could* be mapped into
> > > userspace (i.e. allocated+shared/mixed), rather than splitting when
> > > necessary.
> > >
> > > I know that was the plan laid out in the guest_memfd calls, but I've run
> > > into a couple instances that have me thinking we should revisit this.
> > >
> > > 1) Some of the recent guest_memfd work seems to be gravitating towards having
> > >    userspace populate/initialize guest memory payload prior to boot via
> > >    mmap()'ing the shared guest_memfd pages so things work the same as
> > >    they would for initialized normal VM memory payload (rather than
> > >    relying on back-channels in the kernel to copy user data into guest_memfd
> > >    pages).
> > >
> > >    When you do this though, for an SNP guest at least, that memory
> > >    acceptance is done in chunks of 4MB (with accept_memory=lazy), and
> > >    because that will put each 1GB page into an allocated+mixed state,
> >
> > I would like your help in understanding why we need to start
> > guest_memfd ranges as shared for SNP guests. guest_memfd ranges being
> > private should simply mean that certain ranges are not faultable by
> > userspace.
>
> It's seeming like I probably misremembered, but I thought there was a
> discussion on the guest_memfd call a month (or so?) ago about whether to
> continue to use backchannels to populate guest_memfd pages prior to
> launch. It was in the context of whether to keep using kvm_gmem_populate()
> for populating guest_memfd pages by copying them in from separate
> userspace buffer vs. simply populating them directly from userspace.
> I thought we were leaning on the latter since it was simpler all-around,
> which is great for SNP since that is already how it populates memory: by
> writing to it from userspace, which kvm_gmem_populate() then copies into
> guest_memfd pages. With shared gmem support, we just skip the latter now
> in the kernel rather needing changes to how userspace handles things in
> that regard. But maybe that was just wishful thinking :)

You remember it correctly and that's how userspace should pre-populate
guest memory contents with in-place conversion support available.
Userspace can simply use the following scheme, as an example:
1) Create guest_memfd with the INIT_PRIVATE flag, or if we decide not
to go that way, create a guest_memfd file and set all ranges as
private.
2) Preallocate the guest_memfd ranges.
3) Convert the needed ranges to shared, populate the initial guest
payload and then convert those ranges back to private.

The important point here is that guest_memfd ranges can be marked as
private before pre-allocating guest_memfd ranges.
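
A sketch of step 3, treating the conversion ioctls and their argument
struct below as placeholders for whatever the
KVM_GMEM_CONVERT_SHARED/PRIVATE uAPI ends up looking like (error
handling omitted):

  struct kvm_gmem_convert range = {        /* placeholder struct/fields */
          .offset = payload_off,
          .size   = payload_len,
  };

  ioctl(gmem_fd, KVM_GMEM_CONVERT_SHARED, &range);

  /* Populate in place through the now-faultable shared mapping. */
  void *dst = mmap(NULL, payload_len, PROT_READ | PROT_WRITE,
                   MAP_SHARED, gmem_fd, payload_off);
  memcpy(dst, payload, payload_len);
  munmap(dst, payload_len);

  ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &range);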

>
> But you raise some very compelling points on why this might not be a
> good idea even if that was how that discussion went.
>
> >
> > Will the following work?
> > 1) Userspace starts all guest_memfd ranges as private.
> > 2) During early guest boot it starts issuing PSC requests for
> > converting memory from shared to private
> >     -> KVM forwards this request to userspace
> >     -> Userspace checks that the pages are already private and simply
> > does nothing.
> > 3) Pvalidate from guest on that memory will result in guest_memfd
> > offset query which will cause the RMP table entries to actually get
> > populated.
>
> That would work, but there will need to be changes in userspace to deal
> with how SNP populates memory pre-boot just like normal VMs do. We will
> instead need to copy that data into separate buffers, and pass those in
> as the buffer hva instead of the shared hva corresponding to that GPA.

Initial guest memory payload generally carries a much smaller
footprint so I ignored that detail in the above sequence. As I said
above, userspace should be able to use guest_memfd ranges to directly
populate contents by converting those ranges to shared.

>
> But that seems reasonable if it avoids so many other problems.
>
> >
> > >    we end up splitting every 1GB to 4K and the guest can't even
> > >    accept/PVALIDATE it 2MB at that point even if userspace doesn't touch
> > > >    anything in the range. At some point the guest will convert/accept
> > > >    the entire range, at which point we could merge, but for SNP we'd
> > > >    need guest cooperation to actually use a higher granularity in stage2
> > >    page tables at that point since RMP entries are effectively all split
> > >    to 4K.
> > >
> > >    I understand the intent is to default to private where this wouldn't
> > >    be an issue, and we could punt to userspace to deal with it, but it
> > >    feels like an artificial restriction to place on userspace. And if we
> > >    do want to allow/expect guest_memfd contents to be initialized pre-boot
> > >    just like normal memory, then userspace would need to jump through
> > >    some hoops:
> > >
> > >    - if defaulting to private: add hooks to convert each range that's being
> > >      modified to a shared state prior to writing to it
> >
> > Why is that a problem?
>
> These were only problems if we went the above-mentioned way of
> populating memory pre-boot via mmap() instead of other backchannels. If
> we don't do that, then both these things cease to be problems. Sounds good
> to me. :)

I think there wouldn't be a problem even if we pre-populated memory
pre-boot via mmap(). Using mmap() seems a preferable option to me.
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Michael Roth 5 months, 2 weeks ago
On Wed, Jul 02, 2025 at 10:10:36PM -0700, Vishal Annapurve wrote:
> On Wed, Jul 2, 2025 at 9:12 PM Michael Roth <michael.roth@amd.com> wrote:
> >
> > On Wed, Jul 02, 2025 at 05:46:23PM -0700, Vishal Annapurve wrote:
> > > On Wed, Jul 2, 2025 at 4:25 PM Michael Roth <michael.roth@amd.com> wrote:
> > > >
> > > > On Wed, Jun 11, 2025 at 02:51:38PM -0700, Ackerley Tng wrote:
> > > > > Michael Roth <michael.roth@amd.com> writes:
> > > > >
> > > > > > On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> > > > > >> Track guest_memfd memory's shareability status within the inode as
> > > > > >> opposed to the file, since it is property of the guest_memfd's memory
> > > > > >> contents.
> > > > > >>
> > > > > >> Shareability is a property of the memory and is indexed using the
> > > > > >> page's index in the inode. Because shareability is the memory's
> > > > > >> property, it is stored within guest_memfd instead of within KVM, like
> > > > > >> in kvm->mem_attr_array.
> > > > > >>
> > > > > >> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> > > > > >> retained to allow VMs to only use guest_memfd for private memory and
> > > > > >> some other memory for shared memory.
> > > > > >>
> > > > > >> Not all use cases require guest_memfd() to be shared with the host
> > > > > >> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> > > > > >> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> > > > > >> private to the guest, and therefore not mappable by the
> > > > > >> host. Otherwise, memory is shared until explicitly converted to
> > > > > >> private.
> > > > > >>
> > > > > >> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> > > > > >> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> > > > > >> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> > > > > >> Co-developed-by: Fuad Tabba <tabba@google.com>
> > > > > >> Signed-off-by: Fuad Tabba <tabba@google.com>
> > > > > >> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> > > > > >> ---
> > > > > >>  Documentation/virt/kvm/api.rst |   5 ++
> > > > > >>  include/uapi/linux/kvm.h       |   2 +
> > > > > >>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
> > > > > >>  3 files changed, 129 insertions(+), 2 deletions(-)
> > > > > >>
> > > > > >> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> > > > > >> index 86f74ce7f12a..f609337ae1c2 100644
> > > > > >> --- a/Documentation/virt/kvm/api.rst
> > > > > >> +++ b/Documentation/virt/kvm/api.rst
> > > > > >> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
> > > > > >>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
> > > > > >>  This is validated when the guest_memfd instance is bound to the VM.
> > > > > >>
> > > > > >> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> > > > > >> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> > > > > >> +will initialize the memory for the guest_memfd as guest-only and not faultable
> > > > > >> +by the host.
> > > > > >> +
> > > > > >
> > > > > > KVM_CAP_GMEM_CONVERSION doesn't get introduced until later, so it seems
> > > > > > like this flag should be deferred until that patch is in place. Is it
> > > > > > really needed at that point though? Userspace would be able to set the
> > > > > > initial state via KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls.
> > > > > >
> > > > >
> > > > > I can move this change to the later patch. Thanks! Will fix in the next
> > > > > revision.
> > > > >
> > > > > > The mtree contents seem to get stored in the same manner in either case, so
> > > > > > performance-wise only the overhead of a few userspace<->kernel switches
> > > > > > would be saved. Are there any other reasons?
> > > > > >
> > > > > > Otherwise, maybe just settle on SHARED as a documented default (since at
> > > > > > least non-CoCo VMs would be able to reliably benefit) and let
> > > > > > CoCo/GUEST_MEMFD_FLAG_SUPPORT_SHARED VMs set PRIVATE at whatever
> > > > > > granularity makes sense for the architecture/guest configuration.
> > > > > >
> > > > >
> > > > > Because shared pages are split once any memory is allocated, having a
> > > > > way to INIT_PRIVATE could avoid the split and then merge on
> > > > > conversion. I feel that is enough value to have this config flag; what
> > > > > do you think?
> > > > >
> > > > > I guess we could also have userspace be careful not to do any allocation
> > > > > before converting.
> >
> > (Re-visiting this with the assumption that we *don't* intend to use mmap() to
> > populate memory (in which case you can pretty much ignore my previous
> > response))
> 
> I am assuming in-place conversion with huge page backing for the
> discussion below.
> 
> Looks like there are three scenarios/use cases we are discussing here:
> 1) Pre-allocating guest_memfd file offsets
>    - Userspace can use fallocate to do this for hugepages by keeping
> the file ranges marked private.
> 2) Prefaulting guest EPT/NPT entries
> 3) Populating initial guest payload into guest_memfd memory
>    - Userspace can mark certain ranges as shared, populate the
> contents and convert the ranges back to private. So mmap will come in
> handy here.
> 
> >
> > I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
> > userspace already defaults to marking everything private pretty close to
> > guest_memfd creation time, so the potential for allocations to occur
> > in-between seems small, but worth confirming.
> 
> Ok, I am not much worried about whether the INIT_PRIVATE flag gets
> supported or not, but more about the default setting that different
> CVMs start with. To me, it looks like all CVMs should start with
> everything private by default, and if there is a way to bake that
> configuration in at guest_memfd creation time, that would be good to
> have instead of doing "create and convert" operations; there is a
> fairly low cost to supporting this flag.
> 
> >
> > But I know in the past there was a desire to ensure TDX/SNP could
> > support pre-allocating guest_memfd memory (and even pre-faulting via
> > KVM_PRE_FAULT_MEMORY), but I think that could still work right? The
> > fallocate() handling could still avoid the split if the whole hugepage
> > is private, though there is a bit more potential for that fallocate()
> > to happen before userspace does the "manual" shared->private
> > conversion. I'll double-check on that aspect, but otherwise, is there
> > still any other need for it?
> 
> This use case of being able to preallocate should still work with
> in-place conversion, assuming all ranges are private before
> pre-population.

Ok, I think I was missing that the merge logic here will then restore it
to 1GB before the guest starts, so the folio isn't permanently split if
we do the mmap() and that gives us more flexibility on how we can use
it.

I was thinking we needed to avoid the split from the start by avoiding
paths like mmap() which might trigger the split. I was trying to avoid
any merge->unsplit logic in the THP case (or unsplit in general), in
which case we'd get permanent splits via the mmap() approach, but for
2MB that's probably not a big deal.

> 
> >
> > > >
> > > > I assume we do want to support things like preallocating guest memory, so
> > > > I'm not sure this approach is feasible to avoid splits.
> > > >
> > > > But I feel like we might be working around a deeper issue here, which is
> > > > that we are pre-emptively splitting anything that *could* be mapped into
> > > > userspace (i.e. allocated+shared/mixed), rather than splitting when
> > > > necessary.
> > > >
> > > > I know that was the plan laid out in the guest_memfd calls, but I've run
> > > > into a couple instances that have me thinking we should revisit this.
> > > >
> > > > 1) Some of the recent guest_memfd work seems to be gravitating towards having
> > > >    userspace populate/initialize guest memory payload prior to boot via
> > > >    mmap()'ing the shared guest_memfd pages so things work the same as
> > > >    they would for initialized normal VM memory payload (rather than
> > > >    relying on back-channels in the kernel to copy user data into guest_memfd
> > > >    pages).
> > > >
> > > >    When you do this though, for an SNP guest at least, that memory
> > > >    acceptance is done in chunks of 4MB (with accept_memory=lazy), and
> > > >    because that will put each 1GB page into an allocated+mixed state,
> > >
> > > I would like your help in understanding why we need to start
> > > guest_memfd ranges as shared for SNP guests. guest_memfd ranges being
> > > private should simply mean that certain ranges are not faultable by
> > > userspace.
> >
> > It's seeming like I probably misremembered, but I thought there was a
> > discussion on the guest_memfd call a month (or so?) ago about whether to
> > continue to use backchannels to populate guest_memfd pages prior to
> > launch. It was in the context of whether to keep using kvm_gmem_populate()
> > for populating guest_memfd pages by copying them in from separate
> > userspace buffer vs. simply populating them directly from userspace.
> > I thought we were leaning on the latter since it was simpler all-around,
> > which is great for SNP since that is already how it populates memory: by
> > writing to it from userspace, which kvm_gmem_populate() then copies into
> > guest_memfd pages. With shared gmem support, we just skip the latter now
> > in the kernel rather than needing changes to how userspace handles things in
> > that regard. But maybe that was just wishful thinking :)
> 
> You remember it correctly and that's how userspace should pre-populate
> guest memory contents with in-place conversion support available.
> Userspace can simply use the following scheme, as an example:
> 1) Create guest_memfd with the INIT_PRIVATE flag, or if we decide not
> to go that way, create a guest_memfd file and set all ranges as
> private.
> 2) Preallocate the guest_memfd ranges.
> 3) Convert the needed ranges to shared, populate the initial guest
> payload and then convert those ranges back to private.
> 
> The important point here is that guest_memfd ranges can be marked as
> private before pre-allocating guest_memfd ranges.

Got it, and then the merge logic triggers so you get the 1GB back before
guest launch. That seems reasonable. I was only thinking of the merge
logic in the context of a running guest and it didn't seem all that useful
in that regard, but it makes perfect sense for the above sort of scenario.

Thanks,

Mike

> 
> >
> > But you raise some very compelling points on why this might not be a
> > good idea even if that was how that discussion went.
> >
> > >
> > > Will the following work?
> > > 1) Userspace starts all guest_memfd ranges as private.
> > > 2) During early guest boot it starts issuing PSC requests for
> > > converting memory from shared to private
> > >     -> KVM forwards this request to userspace
> > >     -> Userspace checks that the pages are already private and simply
> > > does nothing.
> > > 3) Pvalidate from guest on that memory will result in guest_memfd
> > > offset query which will cause the RMP table entries to actually get
> > > populated.
> >
> > That would work, but there will need to be changes in userspace to deal
> > with how SNP populates memory pre-boot just like normal VMs do. We will
> > instead need to copy that data into separate buffers, and pass those in
> > as the buffer hva instead of the shared hva corresponding to that GPA.
> 
> Initial guest memory payload generally carries a much smaller
> footprint so I ignored that detail in the above sequence. As I said
> above, userspace should be able to use guest_memfd ranges to directly
> populate contents by converting those ranges to shared.
> 
> >
> > But that seems reasonable if it avoids so many other problems.
> >
> > >
> > > >    we end up splitting every 1GB to 4K and the guest can't even
> > > >    accept/PVALIDATE it 2MB at that point even if userspace doesn't touch
> > > >    anything in the range. At some point the guest will convert/accept
> > > >    the entire range, at which point we could merge, but for SNP we'd
> > > >    need guest cooperation to actually use a higher granularity in stage2
> > > >    page tables at that point since RMP entries are effectively all split
> > > >    to 4K.
> > > >
> > > >    I understand the intent is to default to private where this wouldn't
> > > >    be an issue, and we could punt to userspace to deal with it, but it
> > > >    feels like an artificial restriction to place on userspace. And if we
> > > >    do want to allow/expect guest_memfd contents to be initialized pre-boot
> > > >    just like normal memory, then userspace would need to jump through
> > > >    some hoops:
> > > >
> > > >    - if defaulting to private: add hooks to convert each range that's being
> > > >      modified to a shared state prior to writing to it
> > >
> > > Why is that a problem?
> >
> > These were only problems if we went the above-mentioned way of
> > populating memory pre-boot via mmap() instead of other backchannels. If
> > we don't do that, then both these things cease to be problems. Sounds good
> > to me. :)
> 
> I think there wouldn't be a problem even if we pre-populated memory
> pre-boot via mmap(). Using mmap() seems a preferable option to me.
> 
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Vishal Annapurve 5 months, 1 week ago
On Thu, Jul 3, 2025 at 1:41 PM Michael Roth <michael.roth@amd.com> wrote:
> > > > > >
> > > > > > Because shared pages are split once any memory is allocated, having a
> > > > > > way to INIT_PRIVATE could avoid the split and then merge on
> > > > > > conversion. I feel that is enough value to have this config flag; what
> > > > > > do you think?
> > > > > >
> > > > > > I guess we could also have userspace be careful not to do any allocation
> > > > > > before converting.
> > >
> > > (Re-visiting this with the assumption that we *don't* intend to use mmap() to
> > > populate memory (in which case you can pretty much ignore my previous
> > > response))
> >
> > I am assuming in-place conversion with huge page backing for the
> > discussion below.
> >
> > Looks like there are three scenarios/use cases we are discussing here:
> > 1) Pre-allocating guest_memfd file offsets
> >    - Userspace can use fallocate to do this for hugepages by keeping
> > the file ranges marked private.
> > 2) Prefaulting guest EPT/NPT entries
> > 3) Populating initial guest payload into guest_memfd memory
> >    - Userspace can mark certain ranges as shared, populate the
> > contents and convert the ranges back to private. So mmap will come in
> > handy here.
> >
> > >
> > > I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
> > > userspace already defaults to marking everything private pretty close to
> > > guest_memfd creation time, so the potential for allocations to occur
> > > in-between seems small, but worth confirming.
> >
> > Ok, I am not much worried about whether the INIT_PRIVATE flag gets
> > supported or not, but more about the default setting that different
> > CVMs start with. To me, it looks like all CVMs should start with
> > everything private by default, and if there is a way to bake that
> > configuration in at guest_memfd creation time, that would be good to
> > have instead of doing "create and convert" operations; there is a
> > fairly low cost to supporting this flag.
> >
> > >
> > > But I know in the past there was a desire to ensure TDX/SNP could
> > > support pre-allocating guest_memfd memory (and even pre-faulting via
> > > KVM_PRE_FAULT_MEMORY), but I think that could still work right? The
> > > fallocate() handling could still avoid the split if the whole hugepage
> > > is private, though there is a bit more potential for that fallocate()
> > > to happen before userspace does the "manual" shared->private
> > > conversion. I'll double-check on that aspect, but otherwise, is there
> > > still any other need for it?
> >
> > This use case of being able to preallocate should still work with
> > in-place conversion, assuming all ranges are private before
> > pre-population.
>
> Ok, I think I was missing that the merge logic here will then restore it
> to 1GB before the guest starts, so the folio isn't permanently split if
> we do the mmap() and that gives us more flexibility on how we can use
> it.
>
> I was thinking we needed to avoid the split from the start by avoiding
> paths like mmap() which might trigger the split. I was trying to avoid
> any merge->unsplit logic in the THP case (or unsplit in general), in
> which case we'd get permanent splits via the mmap() approach, but for
> 2MB that's probably not a big deal.

After initial payload population, during its runtime the guest can cause
different hugepages to get split, which can remain split even after the
guest converts them back to private. For THP there may not be much
benefit in merging those pages together, especially if NPT/EPT entries
can't be promoted back to a hugepage mapping and there is no memory
penalty since THP doesn't use HVO.

Wishful thinking on my part: It would be great to figure out a way to
promote these pagetable entries without relying on the guest, if
possible with ABI updates, as I think the host should have some
control over EPT/NPT granularities even for Confidential VMs. Along
similar lines, it would be great to have "page struct"-less memory
working for Confidential VMs, which should greatly reduce the toil
of merge/split operations and would render the conversions mostly
pagetable manipulations.

That being said, memory split and merge seem to be relatively
lightweight for THP (with no memory allocation/freeing), and reusing
the memory files after a reboot of the guest VM will require pages to be
merged to start with a clean slate. One possible option is to always
merge as early as possible; a second option is to invent a new UAPI to
do it on demand.

For 1G pages, even if we go with 1G -> 2M -> 4K split stages, page
splits result in higher memory usage with HVO around, and it becomes
useful to merge them back as early as possible as the guest proceeds to
convert subranges of different hugepages over its lifetime. Merging
pages as early as possible also allows reuse of memory files across
the next reboot without having to invent a new UAPI.

Caveats with "merge as early as possible":
- Shared to private conversions will be slower for hugetlb pages.
   * Counter argument: These conversions are already slow as we need
safe refcounts to reach on the ranges getting converted.
- If guests convert a particular range often then extra merge/split
operations will result in overhead.
   * Counter argument: Since conversions are anyways slow, it's
beneficial for guests to avoid such a scenario and keep back and forth
conversions as less frequent as possible.
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Michael Roth 5 months, 1 week ago
On Mon, Jul 07, 2025 at 07:55:01AM -0700, Vishal Annapurve wrote:
> On Thu, Jul 3, 2025 at 1:41 PM Michael Roth <michael.roth@amd.com> wrote:
> > > > > > >
> > > > > > > Because shared pages are split once any memory is allocated, having a
> > > > > > > way to INIT_PRIVATE could avoid the split and then merge on
> > > > > > > conversion. I feel that is enough value to have this config flag; what
> > > > > > > do you think?
> > > > > > >
> > > > > > > I guess we could also have userspace be careful not to do any allocation
> > > > > > > before converting.
> > > >
> > > > (Re-visiting this with the assumption that we *don't* intend to use mmap() to
> > > > populate memory (in which case you can pretty much ignore my previous
> > > > response))
> > >
> > > I am assuming in-place conversion with huge page backing for the
> > > discussion below.
> > >
> > > Looks like there are three scenarios/usecases we are discussing here:
> > > 1) Pre-allocating guest_memfd file offsets
> > >    - Userspace can use fallocate to do this for hugepages by keeping
> > > the file ranges marked private.
> > > 2) Prefaulting guest EPT/NPT entries
> > > 3) Populating initial guest payload into guest_memfd memory
> > >    - Userspace can mark certain ranges as shared, populate the
> > > contents and convert the ranges back to private. So mmap will come in
> > > handy here.
> > >
> > > >
> > > > I'm still not sure where the INIT_PRIVATE flag comes into play. For SNP,
> > > > userspace already defaults to marking everything private pretty close to
> > > > guest_memfd creation time, so the potential for allocations to occur
> > > > in-between seems small, but worth confirming.
> > >
> > > Ok, I am not much worried about whether the INIT_PRIVATE flag gets
> > > supported or not, but more about the default setting that different
> > > CVMs start with. To me, it looks like all CVMs should start as
> > > everything private by default and if there is a way to bake that
> > > configuration during guest_memfd creation time that would be good to
> > > have instead of doing "create and convert" operations, and there is a
> > > fairly low cost to support this flag.
> > >
> > > >
> > > > But I know in the past there was a desire to ensure TDX/SNP could
> > > > support pre-allocating guest_memfd memory (and even pre-faulting via
> > > > KVM_PRE_FAULT_MEMORY), but I think that could still work right? The
> > > > fallocate() handling could still avoid the split if the whole hugepage
> > > > is private, though there is a bit more potential for that fallocate()
> > > > to happen before userspace does the "manually" shared->private
> > > > conversion. I'll double-check on that aspect, but otherwise, is there
> > > > still any other need for it?
> > >
> > > This usecase of being able to preallocate should still work with
> > > in-place conversion assuming all ranges are private before
> > > pre-population.
> >
> > Ok, I think I was missing that the merge logic here will then restore it
> > to 1GB before the guest starts, so the folio isn't permanently split if
> > we do the mmap() and that gives us more flexibility on how we can use
> > it.
> >
> > I was thinking we needed to avoid the split from the start by avoiding
> > paths like mmap() which might trigger the split. I was trying to avoid
> > any merge->unsplit logic in the THP case (or unsplit in general), in
> > which case we'd get permanent splits via the mmap() approach, but for
> > 2MB that's probably not a big deal.
> 
> After initial payload population, during its runtime the guest can
> cause different hugepages to get split, and these can remain split
> even after the guest converts them back to private. For THP there may
> not be much benefit in merging those pages together, especially if
> NPT/EPT entries can't be promoted back to a hugepage mapping and there
> is no memory penalty since THP doesn't use HVO.
> 
> Wishful thinking on my part: It would be great to figure out a way to
> promote these pagetable entries without relying on the guest, if
> possible with ABI updates, as I think the host should have some
> control over EPT/NPT granularities even for Confidential VMs. Along

I'm not sure how much it would buy us. For example, for a 2MB hugetlb
SNP guest boot with 16GB of memory I see 622 2MB hugepages getting
split, but only about 30 or so of those get merged back to 2MB folios
during guest run-time. These are presumably the set of 2MB regions we
could promote back up, but it's not much given that we wouldn't expect
that value to grow proportionally for larger guests: it's really
separate things like the number of vCPUs (for shared GHCB pages), number
of virtio buffers, etc. that end up determining the upper bound on how
many pages might get split due to 4K private->shared conversion, and
these wouldn't vary all that much from guest to guest outside maybe vCPU
count.

For 1GB hugetlb I see about 6 1GB pages get split, and only 2 get merged
during run-time and would be candidates for promotion.

This could be greatly improved from the guest side by using
higher-order allocations to create pools of shared memory that could
then be used to reduce the number of splits caused by doing
private->shared conversions on random ranges of malloc'd memory,
and this could be done even without special promotion support on the
host for pretty much the entirety of guest memory. The idea there would
be to just make optimized guests avoid the splits completely, rather
than relying on the limited subset that hardware can optimize without
guest cooperation.
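
As a rough sketch of that guest-side idea (assuming an x86 CoCo guest,
where set_memory_decrypted() performs the private->shared conversion;
pool carve-out, locking and error paths are elided):

static void *shared_pool_init(void)
{
	struct page *chunk;

	/* Order-9 = 2M with 4K base pages; buddy allocations are
	 * naturally aligned to their order. */
	chunk = alloc_pages(GFP_KERNEL | __GFP_ZERO, 9);
	if (!chunk)
		return NULL;

	/* One private->shared conversion for the whole 2M chunk; hand
	 * out shared buffers from it instead of converting random 4K
	 * pages later. */
	if (set_memory_decrypted((unsigned long)page_address(chunk), 512)) {
		__free_pages(chunk, 9);
		return NULL;
	}

	return page_address(chunk);
}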

> similar lines, it would be great to have "page struct"-less memory
> working for Confidential VMs, which should greatly reduce the toil
> of merge/split operations and render conversions mostly into
> pagetable manipulations.

FWIW, I did some profiling of split/merge vs. overall conversion time
(by that I mean all cycles spent within kvm_gmem_convert_execute_work()),
and while split/merge does take quite a few more cycles than your
average conversion operation (~100x more), the total cycles spent
splitting/merging ended up being about 7% of the total cycles spent
handling conversions (1043938460 cycles in this case).

For 1GB, a split/merge takes >1000x more than a normal conversion
operation (46475980 cycles vs 320 in this sample), but it's probably 
still not too bad vs the overall conversion path, and as mentioned above
it only happens about 6x for 16GB SNP guest so I don't think split/merge
overhead is a huge deal for current guests, especially if we work toward
optimizing guest-side usage of shared memory in the future. (There is
potential for this to crater performance for a very poorly-optimized
guest; however, I think the guest should bear some of the burden for
that sort of thing: e.g. flipping the same page back-and-forth between
shared/private vs. caching it for continued usage as a shared page in the
guest driver path isn't something we should put too much effort into
optimizing.)

> 
> That being said, memory split and merge seem to be relatively
> lightweight for THP (with no memory allocation/freeing), and reusing
> the memory files after a reboot of the guest VM will require pages to
> be merged so the next boot starts with a clean slate. One option is to
> always merge as early as possible; a second is to invent a new UAPI to
> do it on demand.
> 
> For 1G pages, even if we go with 1G -> 2M -> 4K split stages, page
> splits result in higher memory usage with HVO around, and it becomes
> useful to merge pages back as early as possible as the guest proceeds
> to convert subranges of different hugepages over its lifetime. Merging
> pages as early as possible also allows reuse of memory files across
> the next reboot without having to invent a new UAPI.
> 
> Caveats with "merge as early as possible":
> - Shared to private conversions will be slower for hugetlb pages.
>    * Counter argument: These conversions are already slow as we need
> safe refcounts to reach on the ranges getting converted.
> - If guests convert a particular range often then extra merge/split
> operations will result in overhead.
>    * Counter argument: Since conversions are anyways slow, it's
> beneficial for guests to avoid such a scenario and keep back and forth
> conversions as less frequent as possible.

Fair enough. I'm not seeing any major reason not to do things this way,
as the overhead doesn't seem to be very significant for the common case.

(as noted above, the number of hugetlb pages we actually end
up merging at guest run-time seems to be fairly small, but maybe there
are scenarios where this will have a bigger impact, and it certainly helps
to have it there for the pre-boot merges.)

-Mike
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Vishal Annapurve 5 months, 1 week ago
On Fri, Jul 11, 2025 at 5:11 PM Michael Roth <michael.roth@amd.com> wrote:
> >
> > Wishful thinking on my part: It would be great to figure out a way to
> > promote these pagetable entries without relying on the guest, if
> > possible with ABI updates, as I think the host should have some
> > control over EPT/NPT granularities even for Confidential VMs. Along
>
> I'm not sure how much it would buy us. For example, for a 2MB hugetlb
> SNP guest boot with 16GB of memory I see 622 2MB hugepages getting
> split, but only about 30 or so of those get merged back to 2MB folios
> during guest run-time. These are presumably the set of 2MB regions we
> could promote back up, but it's not much given that we wouldn't expect
> that value to grow proportionally for larger guests: it's really
> separate things like the number of vCPUs (for shared GHCB pages), number
> of virtio buffers, etc. that end up determining the upper bound on how
> many pages might get split due to 4K private->shared conversion, and
> these wouldn't vary all that much from guest to guest outside maybe vCPU
> count.
>
> For 1GB hugetlb I see about 6 1GB pages get split, and only 2 get merged
> during run-time and would be candidates for promotion.
>

Thanks for the great analysis here. I think we will need to repeat
such analysis for other scenarios such as usage with accelerators.

> This could be greatly improved from the guest side by using
> higher-order allocations to create pools of shared memory that could
> then be used to reduce the number of splits caused by doing
> private->shared conversions on random ranges of malloc'd memory,
> and this could be done even without special promotion support on the
> host for pretty much the entirety of guest memory. The idea there would
> be to just make optimized guests avoid the splits completely, rather
> than relying on the limited subset that hardware can optimize without
> guest cooperation.

Yes, it would be great to improve the situation from the guest side;
e.g. I tried a rough draft [1], and the conclusion there was that we
need to set aside "enough" guest memory as CMA to make all DMA go
through 2M-aligned buffers. It's hard to figure out how much is
"enough", but we could start somewhere. That being said, the host
still has to manage memory this way, splitting/merging at runtime,
because I don't think it's possible to enforce that all conversions
happen at 2M (or any at 1G) granularity. So it's also very likely that
even if guests do a significant chunk of conversions at hugepage
granularity, the host still needs to split pages all the way to 4K for
all shared regions, unless we can bake another restriction into the
conversion ABI: that guests can only convert back to private the same
ranges that were previously converted to shared.

[1] https://lore.kernel.org/lkml/20240112055251.36101-1-vannapurve@google.com/

>
> > similar lines, it would be great to have "page struct"-less memory
> > working for Confidential VMs, which should greatly reduce the toil
> > of merge/split operations and render conversions mostly into
> > pagetable manipulations.
>
> FWIW, I did some profiling of split/merge vs. overall conversion time
> (by that I mean all cycles spent within kvm_gmem_convert_execute_work()),
> and while split/merge does take quite a few more cycles than your
> average conversion operation (~100x more), the total cycles spent
> splitting/merging ended up being about 7% of the total cycles spent
> handling conversions (1043938460 cycles in this case).
>
> For 1GB, a split/merge takes >1000x more than a normal conversion
> operation (46475980 cycles vs 320 in this sample), but it's probably
> still not too bad vs the overall conversion path, and as mentioned above
> it only happens about 6x for 16GB SNP guest so I don't think split/merge
> overhead is a huge deal for current guests, especially if we work toward
> optimizing guest-side usage of shared memory in the future. (There is
> potential for this to crater performance for a very poorly-optimized
> guest; however, I think the guest should bear some of the burden for
> that sort of thing: e.g. flipping the same page back-and-forth between
> shared/private vs. caching it for continued usage as a shared page in the
> guest driver path isn't something we should put too much effort into
> optimizing.)
>

As per past discussions, guest_memfd private pages are managed solely
by guest_memfd. We don't need, and effectively don't want, the kernel
to manage guest private memory. So in theory we can get rid of page
structs for private pages too, allocating page structs only for shared
memory on conversion and deallocating them on conversion back to
private.

And when we have base core-mm allocators that hand out raw pfns to
start with, we don't even need shared memory ranges to be backed by
page structs.

A few hurdles we need to cross:
1) Invent a new filemap equivalent that maps guest_memfd offsets to
pfns (see the sketch after this list)
2) Modify TDX EPT management to work with pfns and not page structs
3) Modify generic KVM NPT/EPT management logic to work with pfns and
not rely on page structs
4) Modify memory error/hwpoison handling to route all memory errors on
such pfns to guest_memfd.
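
Purely to illustrate hurdle 1, a pfn-backed lookup could look roughly
like this (all names here are made up, not an existing API):

/* Hypothetical "filemap without struct pages": guest_memfd offset -> pfn.
 * The maple tree must have been initialized with mt_init() first. */
struct gmem_pfn_map {
	struct maple_tree mt;
};

static int gmem_pfn_store(struct gmem_pfn_map *m, pgoff_t index,
			  unsigned long pfn)
{
	return mtree_store(&m->mt, index, xa_mk_value(pfn), GFP_KERNEL);
}

/* Returns the pfn at @index, or -1UL if nothing is mapped there. */
static unsigned long gmem_pfn_load(struct gmem_pfn_map *m, pgoff_t index)
{
	void *entry = mtree_load(&m->mt, index);

	return entry ? xa_to_value(entry) : -1UL;
}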

I believe there are obvious benefits (reduced complexity, reduced
memory footprint, etc.) if we go this route, and we are very likely to
go this route for future use cases even if we decide to live with
conversion costs today.
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Vishal Annapurve 5 months, 2 weeks ago
On Wed, Jul 2, 2025 at 5:46 PM Vishal Annapurve <vannapurve@google.com> wrote:
> ...
> >
> > 2) There are some use-cases for HugeTLB + CoCo that have come to my
> >    attention recently that put a lot of weight on still being able to
> >    maximize mapping/hugepage size when accessing shared mem from userspace,
> >    e.g. for certain DPDK workloads that accessed shared guest buffers
> >    from host userspace. We don't really have a story for this, and I
> >    wouldn't expect us to at this stage, but I think it ties into #1 so
> >    might be worth considering in that context.
>
The major problem I see here is that if anything in the kernel does a
GUP on shared memory ranges (which is very likely to happen), it would
be difficult to get those users to let go of the whole hugepage before
it can be split safely.

The scenario I was alluding to here: the guest trying to convert a
subpage of a hugepage-backed shared range to private.
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Binbin Wu 6 months, 3 weeks ago

On 5/15/2025 7:41 AM, Ackerley Tng wrote:
> Track guest_memfd memory's shareability status within the inode as
> opposed to the file, since it is property of the guest_memfd's memory
> contents.
>
> Shareability is a property of the memory and is indexed using the
> page's index in the inode. Because shareability is the memory's
> property, it is stored within guest_memfd instead of within KVM, like
> in kvm->mem_attr_array.
>
> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> retained to allow VMs to only use guest_memfd for private memory and
> some other memory for shared memory.
>
> Not all use cases require guest_memfd() to be shared with the host
> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> private to the guest, and therefore not mappable by the
> host. Otherwise, memory is shared until explicitly converted to
> private.
>
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> ---
>   Documentation/virt/kvm/api.rst |   5 ++
>   include/uapi/linux/kvm.h       |   2 +
>   virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>   3 files changed, 129 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 86f74ce7f12a..f609337ae1c2 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>   The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>   This is validated when the guest_memfd instance is bound to the VM.
>   
> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.

It seems that the sentence is stale?
Didn't find the definition of KVM_CAP_GMEM_CONVERSIONS.

> Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> +will initialize the memory for the guest_memfd as guest-only and not faultable
> +by the host.
> +
[...]
>   
>   static int kvm_gmem_init_fs_context(struct fs_context *fc)
> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>   static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>   						      loff_t size, u64 flags)
>   {
> +	struct kvm_gmem_inode_private *private;
>   	struct inode *inode;
> +	int err;
>   
>   	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>   	if (IS_ERR(inode))
>   		return inode;
>   
> +	err = -ENOMEM;
> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> +	if (!private)
> +		goto out;
> +
> +	mt_init(&private->shareability);

shareability is defined only when CONFIG_KVM_GMEM_SHARED_MEM is enabled, so this should be done within #ifdef CONFIG_KVM_GMEM_SHARED_MEM.


> +	inode->i_mapping->i_private_data = private;
> +
> +	err = kvm_gmem_shareability_setup(private, size, flags);
> +	if (err)
> +		goto out;
> +
>   	inode->i_private = (void *)(unsigned long)flags;
>   	inode->i_op = &kvm_gmem_iops;
>   	inode->i_mapping->a_ops = &kvm_gmem_aops;
> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>   	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>   
>   	return inode;
> +
> +out:
> +	iput(inode);
> +
> +	return ERR_PTR(err);
>   }
>   
>
[...]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 6 months, 3 weeks ago
Binbin Wu <binbin.wu@linux.intel.com> writes:

> On 5/15/2025 7:41 AM, Ackerley Tng wrote:
>> Track guest_memfd memory's shareability status within the inode as
>> opposed to the file, since it is property of the guest_memfd's memory
>> contents.
>>
>> Shareability is a property of the memory and is indexed using the
>> page's index in the inode. Because shareability is the memory's
>> property, it is stored within guest_memfd instead of within KVM, like
>> in kvm->mem_attr_array.
>>
>> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
>> retained to allow VMs to only use guest_memfd for private memory and
>> some other memory for shared memory.
>>
>> Not all use cases require guest_memfd() to be shared with the host
>> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
>> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
>> private to the guest, and therefore not mappable by the
>> host. Otherwise, memory is shared until explicitly converted to
>> private.
>>
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
>> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
>> Co-developed-by: Fuad Tabba <tabba@google.com>
>> Signed-off-by: Fuad Tabba <tabba@google.com>
>> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
>> ---
>>   Documentation/virt/kvm/api.rst |   5 ++
>>   include/uapi/linux/kvm.h       |   2 +
>>   virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>>   3 files changed, 129 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 86f74ce7f12a..f609337ae1c2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>>   The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>>   This is validated when the guest_memfd instance is bound to the VM.
>>   
>> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
>> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.
>
> It seems that the sentence is stale?
> Didn't find the definition of KVM_CAP_GMEM_CONVERSIONS.
>

Thanks. This should read

If the capability KVM_CAP_GMEM_SHARED_MEM is supported, and
GUEST_MEMFD_FLAG_SUPPORT_SHARED is specified, then the 'flags' field
supports GUEST_MEMFD_FLAG_INIT_PRIVATE.

>> Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
>> +will initialize the memory for the guest_memfd as guest-only and not faultable
>> +by the host.
>> +
> [...]
>>   
>>   static int kvm_gmem_init_fs_context(struct fs_context *fc)
>> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>>   static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>   						      loff_t size, u64 flags)
>>   {
>> +	struct kvm_gmem_inode_private *private;
>>   	struct inode *inode;
>> +	int err;
>>   
>>   	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>>   	if (IS_ERR(inode))
>>   		return inode;
>>   
>> +	err = -ENOMEM;
>> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
>> +	if (!private)
>> +		goto out;
>> +
>> +	mt_init(&private->shareability);
>
> shareability is defined only when CONFIG_KVM_GMEM_SHARED_MEM is enabled, so this should be done within #ifdef CONFIG_KVM_GMEM_SHARED_MEM.
>
>

Yes, thank you! Will also update this to only initialize shareability if
(flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED).

>> +	inode->i_mapping->i_private_data = private;
>> +
>> +	err = kvm_gmem_shareability_setup(private, size, flags);
>> +	if (err)
>> +		goto out;
>> +
>>   	inode->i_private = (void *)(unsigned long)flags;
>>   	inode->i_op = &kvm_gmem_iops;
>>   	inode->i_mapping->a_ops = &kvm_gmem_aops;
>> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>   	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>   
>>   	return inode;
>> +
>> +out:
>> +	iput(inode);
>> +
>> +	return ERR_PTR(err);
>>   }
>>   
>>
> [...]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 6 months, 3 weeks ago
Ackerley Tng <ackerleytng@google.com> writes:

> Binbin Wu <binbin.wu@linux.intel.com> writes:
>
>> On 5/15/2025 7:41 AM, Ackerley Tng wrote:
>>> Track guest_memfd memory's shareability status within the inode as
>>> opposed to the file, since it is property of the guest_memfd's memory
>>> contents.
>>>
>>> Shareability is a property of the memory and is indexed using the
>>> page's index in the inode. Because shareability is the memory's
>>> property, it is stored within guest_memfd instead of within KVM, like
>>> in kvm->mem_attr_array.
>>>
>>> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
>>> retained to allow VMs to only use guest_memfd for private memory and
>>> some other memory for shared memory.
>>>
>>> Not all use cases require guest_memfd() to be shared with the host
>>> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
>>> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
>>> private to the guest, and therefore not mappable by the
>>> host. Otherwise, memory is shared until explicitly converted to
>>> private.
>>>
>>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>>> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
>>> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
>>> Co-developed-by: Fuad Tabba <tabba@google.com>
>>> Signed-off-by: Fuad Tabba <tabba@google.com>
>>> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
>>> ---
>>>   Documentation/virt/kvm/api.rst |   5 ++
>>>   include/uapi/linux/kvm.h       |   2 +
>>>   virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>>>   3 files changed, 129 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>>> index 86f74ce7f12a..f609337ae1c2 100644
>>> --- a/Documentation/virt/kvm/api.rst
>>> +++ b/Documentation/virt/kvm/api.rst
>>> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>>>   The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>>>   This is validated when the guest_memfd instance is bound to the VM.
>>>   
>>> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
>>> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.
>>
>> It seems that the sentence is stale?
>> Didn't find the definition of KVM_CAP_GMEM_CONVERSIONS.
>>
>
> Thanks. This should read
>
> If the capability KVM_CAP_GMEM_SHARED_MEM is supported, and
> GUEST_MEMFD_FLAG_SUPPORT_SHARED is specified, then the 'flags' field
> supports GUEST_MEMFD_FLAG_INIT_PRIVATE.
>

My bad, saw your other email. Fixing the above:

If the capability KVM_CAP_GMEM_CONVERSION is supported, and
GUEST_MEMFD_FLAG_SUPPORT_SHARED is specified, then the 'flags' field
supports GUEST_MEMFD_FLAG_INIT_PRIVATE.
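
For completeness, intended userspace usage would be something like this
(vm_fd and the 1G size are placeholders):

	struct kvm_create_guest_memfd args = {
		.size  = 1UL << 30,
		.flags = GUEST_MEMFD_FLAG_SUPPORT_SHARED |
			 GUEST_MEMFD_FLAG_INIT_PRIVATE,
	};
	int gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &args);
	/* All offsets start guest-only; host faults fail until converted. */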

>>> Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
>>> +will initialize the memory for the guest_memfd as guest-only and not faultable
>>> +by the host.
>>> +
>> [...]
>>>   
>>>   static int kvm_gmem_init_fs_context(struct fs_context *fc)
>>> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>>>   static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>>   						      loff_t size, u64 flags)
>>>   {
>>> +	struct kvm_gmem_inode_private *private;
>>>   	struct inode *inode;
>>> +	int err;
>>>   
>>>   	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>>>   	if (IS_ERR(inode))
>>>   		return inode;
>>>   
>>> +	err = -ENOMEM;
>>> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
>>> +	if (!private)
>>> +		goto out;
>>> +
>>> +	mt_init(&private->shareability);
>>
>> shareability is defined only when CONFIG_KVM_GMEM_SHARED_MEM is enabled, so this should be done within #ifdef CONFIG_KVM_GMEM_SHARED_MEM.
>>
>>
>
> Yes, thank you! Will also update this to only initialize shareability if
> (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED).
>
>>> +	inode->i_mapping->i_private_data = private;
>>> +
>>> +	err = kvm_gmem_shareability_setup(private, size, flags);
>>> +	if (err)
>>> +		goto out;
>>> +
>>>   	inode->i_private = (void *)(unsigned long)flags;
>>>   	inode->i_op = &kvm_gmem_iops;
>>>   	inode->i_mapping->a_ops = &kvm_gmem_aops;
>>> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>>   	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>>   
>>>   	return inode;
>>> +
>>> +out:
>>> +	iput(inode);
>>> +
>>> +	return ERR_PTR(err);
>>>   }
>>>   
>>>
>> [...]
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Binbin Wu 6 months, 3 weeks ago

On 5/27/2025 4:25 PM, Binbin Wu wrote:
>
>
> On 5/15/2025 7:41 AM, Ackerley Tng wrote:
>> Track guest_memfd memory's shareability status within the inode as
>> opposed to the file, since it is property of the guest_memfd's memory
>> contents.
>>
>> Shareability is a property of the memory and is indexed using the
>> page's index in the inode. Because shareability is the memory's
>> property, it is stored within guest_memfd instead of within KVM, like
>> in kvm->mem_attr_array.
>>
>> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
>> retained to allow VMs to only use guest_memfd for private memory and
>> some other memory for shared memory.
>>
>> Not all use cases require guest_memfd() to be shared with the host
>> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
>> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
>> private to the guest, and therefore not mappable by the
>> host. Otherwise, memory is shared until explicitly converted to
>> private.
>>
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
>> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
>> Co-developed-by: Fuad Tabba <tabba@google.com>
>> Signed-off-by: Fuad Tabba <tabba@google.com>
>> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
>> ---
>>   Documentation/virt/kvm/api.rst |   5 ++
>>   include/uapi/linux/kvm.h       |   2 +
>>   virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>>   3 files changed, 129 insertions(+), 2 deletions(-)
>>
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 86f74ce7f12a..f609337ae1c2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>>   The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>>   This is validated when the guest_memfd instance is bound to the VM.
>>   +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
>> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.
>
> It seems that the sentence is stale?
> Didn't find the definition of KVM_CAP_GMEM_CONVERSIONS.
Aha! It's a typo; it should be KVM_CAP_GMEM_CONVERSION.



>
>> Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
>> +will initialize the memory for the guest_memfd as guest-only and not faultable
>> +by the host.
>> +
> [...]
>>     static int kvm_gmem_init_fs_context(struct fs_context *fc)
>> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>>   static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>                                 loff_t size, u64 flags)
>>   {
>> +    struct kvm_gmem_inode_private *private;
>>       struct inode *inode;
>> +    int err;
>>         inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>>       if (IS_ERR(inode))
>>           return inode;
>>   +    err = -ENOMEM;
>> +    private = kzalloc(sizeof(*private), GFP_KERNEL);
>> +    if (!private)
>> +        goto out;
>> +
>> +    mt_init(&private->shareability);
>
> shareability is defined only when CONFIG_KVM_GMEM_SHARED_MEM is enabled, so this should be done within #ifdef CONFIG_KVM_GMEM_SHARED_MEM.
>
>
>> + inode->i_mapping->i_private_data = private;
>> +
>> +    err = kvm_gmem_shareability_setup(private, size, flags);
>> +    if (err)
>> +        goto out;
>> +
>>       inode->i_private = (void *)(unsigned long)flags;
>>       inode->i_op = &kvm_gmem_iops;
>>       inode->i_mapping->a_ops = &kvm_gmem_aops;
>> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>       WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>>         return inode;
>> +
>> +out:
>> +    iput(inode);
>> +
>> +    return ERR_PTR(err);
>>   }
>>
> [...]
>

Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Yan Zhao 6 months, 3 weeks ago
On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
> Track guest_memfd memory's shareability status within the inode as
> opposed to the file, since it is property of the guest_memfd's memory
> contents.
> 
> Shareability is a property of the memory and is indexed using the
> page's index in the inode. Because shareability is the memory's
> property, it is stored within guest_memfd instead of within KVM, like
> in kvm->mem_attr_array.
> 
> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
> retained to allow VMs to only use guest_memfd for private memory and
> some other memory for shared memory.
> 
> Not all use cases require guest_memfd() to be shared with the host
> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
> private to the guest, and therefore not mappable by the
> host. Otherwise, memory is shared until explicitly converted to
> private.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
> Co-developed-by: Fuad Tabba <tabba@google.com>
> Signed-off-by: Fuad Tabba <tabba@google.com>
> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
> ---
>  Documentation/virt/kvm/api.rst |   5 ++
>  include/uapi/linux/kvm.h       |   2 +
>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>  3 files changed, 129 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 86f74ce7f12a..f609337ae1c2 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>  This is validated when the guest_memfd instance is bound to the VM.
>  
> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
> +will initialize the memory for the guest_memfd as guest-only and not faultable
> +by the host.
> +
>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
>  
>  4.143 KVM_PRE_FAULT_MEMORY
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 4cc824a3a7c9..d7df312479aa 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>  
>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
> +
>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
>  
>  struct kvm_create_guest_memfd {
>  	__u64 size;
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 239d0f13dcc1..590932499eba 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -4,6 +4,7 @@
>  #include <linux/falloc.h>
>  #include <linux/fs.h>
>  #include <linux/kvm_host.h>
> +#include <linux/maple_tree.h>
>  #include <linux/pseudo_fs.h>
>  #include <linux/pagemap.h>
>  
> @@ -17,6 +18,24 @@ struct kvm_gmem {
>  	struct list_head entry;
>  };
>  
> +struct kvm_gmem_inode_private {
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	struct maple_tree shareability;
> +#endif
> +};
> +
> +enum shareability {
> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
> +};
> +
> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> +
> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
> +{
> +	return inode->i_mapping->i_private_data;
> +}
> +
>  /**
>   * folio_file_pfn - like folio_file_page, but return a pfn.
>   * @folio: The folio which contains this index.
> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>  }
>  
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +
> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
> +				      loff_t size, u64 flags)
> +{
> +	enum shareability m;
> +	pgoff_t last;
> +
> +	last = (size >> PAGE_SHIFT) - 1;
> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
> +						    SHAREABILITY_ALL;
> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
> +				 GFP_KERNEL);
> +}
> +
> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
> +						 pgoff_t index)
> +{
> +	struct maple_tree *mt;
> +	void *entry;
> +
> +	mt = &kvm_gmem_private(inode)->shareability;
> +	entry = mtree_load(mt, index);
> +	WARN(!entry,
> +	     "Shareability should always be defined for all indices in inode.");
I noticed that in [1], kvm_gmem_mmap() does not check the range.
So, the WARN() here can be hit when userspace mmap()s an area larger than the
inode size and accesses the out-of-range HVA.

Maybe limit the mmap() range?

@@ -1609,6 +1620,10 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
        if (!kvm_gmem_supports_shared(file_inode(file)))
                return -ENODEV;

+       if (vma->vm_end - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT) > i_size_read(file_inode(file)))
+               return -EINVAL;
+
        if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
            (VM_SHARED | VM_MAYSHARE)) {
                return -EINVAL;

[1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/

> +	return xa_to_value(entry);
> +}
> +
> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> +		return ERR_PTR(-EACCES);
> +
> +	return kvm_gmem_get_folio(inode, index);
> +}
> +
> +#else
> +
> +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> +{
> +	return 0;
> +}
> +
> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> +{
> +	WARN_ONCE("Unexpected call to get shared folio.")
> +	return NULL;
> +}
> +
> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> +
>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>  				    pgoff_t index, struct folio *folio)
>  {
> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>  
>  	filemap_invalidate_lock_shared(inode->i_mapping);
>  
> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>  	if (IS_ERR(folio)) {
>  		int err = PTR_ERR(folio);
>  
> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
>  	.fallocate	= kvm_gmem_fallocate,
>  };
>  
> +static void kvm_gmem_free_inode(struct inode *inode)
> +{
> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> +
> +	kfree(private);
> +
> +	free_inode_nonrcu(inode);
> +}
> +
> +static void kvm_gmem_destroy_inode(struct inode *inode)
> +{
> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> +
> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> +	/*
> +	 * mtree_destroy() can't be used within rcu callback, hence can't be
> +	 * done in ->free_inode().
> +	 */
> +	if (private)
> +		mtree_destroy(&private->shareability);
> +#endif
> +}
> +
>  static const struct super_operations kvm_gmem_super_operations = {
>  	.statfs		= simple_statfs,
> +	.destroy_inode	= kvm_gmem_destroy_inode,
> +	.free_inode	= kvm_gmem_free_inode,
>  };
>  
>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>  						      loff_t size, u64 flags)
>  {
> +	struct kvm_gmem_inode_private *private;
>  	struct inode *inode;
> +	int err;
>  
>  	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>  	if (IS_ERR(inode))
>  		return inode;
>  
> +	err = -ENOMEM;
> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
> +	if (!private)
> +		goto out;
> +
> +	mt_init(&private->shareability);
Wrap the mt_init() inside "#ifdef CONFIG_KVM_GMEM_SHARED_MEM" ?

> +	inode->i_mapping->i_private_data = private;
> +
> +	err = kvm_gmem_shareability_setup(private, size, flags);
> +	if (err)
> +		goto out;
> +
>  	inode->i_private = (void *)(unsigned long)flags;
>  	inode->i_op = &kvm_gmem_iops;
>  	inode->i_mapping->a_ops = &kvm_gmem_aops;
> @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>  	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>  
>  	return inode;
> +
> +out:
> +	iput(inode);
> +
> +	return ERR_PTR(err);
>  }
>  
>  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
> @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>  	if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
>  		valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
>  
> +	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
> +		valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
> +
>  	if (flags & ~valid_flags)
>  		return -EINVAL;
>  
> @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  	if (!file)
>  		return -EFAULT;
>  
> +	filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> +
>  	folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
>  	if (IS_ERR(folio)) {
>  		r = PTR_ERR(folio);
> @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>  		*page = folio_file_page(folio, index);
>  	else
>  		folio_put(folio);
> -
>  out:
> +	filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>  	fput(file);
>  	return r;
>  }
> -- 
> 2.49.0.1045.g170613ef41-goog
> 
>
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Fuad Tabba 6 months, 3 weeks ago
Hi,

.. snip..

> I noticed that in [1], kvm_gmem_mmap() does not check the range.
> So, the WARN() here can be hit when userspace mmap()s an area larger than the
> inode size and accesses the out-of-range HVA.
>
> Maybe limit the mmap() range?
>
> @@ -1609,6 +1620,10 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
>         if (!kvm_gmem_supports_shared(file_inode(file)))
>                 return -ENODEV;
>
> +       if (vma->vm_end - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT) > i_size_read(file_inode(file)))
> +               return -EINVAL;
> +
>         if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
>             (VM_SHARED | VM_MAYSHARE)) {
>                 return -EINVAL;
>
> [1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/

I don't think we want to do that for a couple of reasons. We catch
such invalid accesses on faulting, and, by analogy, afaict, neither
secretmem nor memfd perform a similar check on mmap (nor do
memory-mapped files in general).

There are also valid reasons why a user would want to deliberately
mmap more memory than the backing store, knowing that it's only going
to fault what it's going to use, e.g., alignment.

Cheers,
/fuad


> > +     return xa_to_value(entry);
> > +}
> > +
> > +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> > +{
> > +     if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
> > +             return ERR_PTR(-EACCES);
> > +
> > +     return kvm_gmem_get_folio(inode, index);
> > +}
> > +
> > +#else
> > +
> > +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
> > +{
> > +     return 0;
> > +}
> > +
> > +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
> > +{
> > +     WARN_ONCE("Unexpected call to get shared folio.")
> > +     return NULL;
> > +}
> > +
> > +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
> > +
> >  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
> >                                   pgoff_t index, struct folio *folio)
> >  {
> > @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
> >
> >       filemap_invalidate_lock_shared(inode->i_mapping);
> >
> > -     folio = kvm_gmem_get_folio(inode, vmf->pgoff);
> > +     folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
> >       if (IS_ERR(folio)) {
> >               int err = PTR_ERR(folio);
> >
> > @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
> >       .fallocate      = kvm_gmem_fallocate,
> >  };
> >
> > +static void kvm_gmem_free_inode(struct inode *inode)
> > +{
> > +     struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> > +
> > +     kfree(private);
> > +
> > +     free_inode_nonrcu(inode);
> > +}
> > +
> > +static void kvm_gmem_destroy_inode(struct inode *inode)
> > +{
> > +     struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
> > +
> > +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
> > +     /*
> > +      * mtree_destroy() can't be used within rcu callback, hence can't be
> > +      * done in ->free_inode().
> > +      */
> > +     if (private)
> > +             mtree_destroy(&private->shareability);
> > +#endif
> > +}
> > +
> >  static const struct super_operations kvm_gmem_super_operations = {
> >       .statfs         = simple_statfs,
> > +     .destroy_inode  = kvm_gmem_destroy_inode,
> > +     .free_inode     = kvm_gmem_free_inode,
> >  };
> >
> >  static int kvm_gmem_init_fs_context(struct fs_context *fc)
> > @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
> >  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
> >                                                     loff_t size, u64 flags)
> >  {
> > +     struct kvm_gmem_inode_private *private;
> >       struct inode *inode;
> > +     int err;
> >
> >       inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
> >       if (IS_ERR(inode))
> >               return inode;
> >
> > +     err = -ENOMEM;
> > +     private = kzalloc(sizeof(*private), GFP_KERNEL);
> > +     if (!private)
> > +             goto out;
> > +
> > +     mt_init(&private->shareability);
> Wrap the mt_init() inside "#ifdef CONFIG_KVM_GMEM_SHARED_MEM" ?
>
> > +     inode->i_mapping->i_private_data = private;
> > +
> > +     err = kvm_gmem_shareability_setup(private, size, flags);
> > +     if (err)
> > +             goto out;
> > +
> >       inode->i_private = (void *)(unsigned long)flags;
> >       inode->i_op = &kvm_gmem_iops;
> >       inode->i_mapping->a_ops = &kvm_gmem_aops;
> > @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
> >       WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
> >
> >       return inode;
> > +
> > +out:
> > +     iput(inode);
> > +
> > +     return ERR_PTR(err);
> >  }
> >
> >  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
> > @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> >       if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
> >               valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
> >
> > +     if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
> > +             valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
> > +
> >       if (flags & ~valid_flags)
> >               return -EINVAL;
> >
> > @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> >       if (!file)
> >               return -EFAULT;
> >
> > +     filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
> > +
> >       folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
> >       if (IS_ERR(folio)) {
> >               r = PTR_ERR(folio);
> > @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> >               *page = folio_file_page(folio, index);
> >       else
> >               folio_put(folio);
> > -
> >  out:
> > +     filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
> >       fput(file);
> >       return r;
> >  }
> > --
> > 2.49.0.1045.g170613ef41-goog
> >
> >
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 6 months, 3 weeks ago
Fuad Tabba <tabba@google.com> writes:

> Hi,
>
> .. snip..
>
>> I noticed that in [1], kvm_gmem_mmap() does not check the range.
>> So, the WARN() here can be hit when userspace mmap()s an area larger than the
>> inode size and accesses the out-of-range HVA.
>>
>> Maybe limit the mmap() range?
>>
>> @@ -1609,6 +1620,10 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
>>         if (!kvm_gmem_supports_shared(file_inode(file)))
>>                 return -ENODEV;
>>
>> +       if (vma->vm_end - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT) > i_size_read(file_inode(file)))
>> +               return -EINVAL;
>> +
>>         if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
>>             (VM_SHARED | VM_MAYSHARE)) {
>>                 return -EINVAL;
>>
>> [1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/
>
> I don't think we want to do that for a couple of reasons. We catch
> such invalid accesses on faulting, and, by analogy, afaict, neither
> secretmem nor memfd perform a similar check on mmap (nor do
> memory-mapped files in general).
>
> There are also valid reasons why a user would want to deliberately
> mmap more memory than the backing store, knowing that it's only going
> to fault what it's going to use, e.g., alignment.
>

This is a good point.

I think there's no check against the inode size on faulting now though?
v10's [1] kvm_gmem_fault_shared() calls kvm_gmem_get_folio()
straightaway.

We should add a check like [2] to kvm_gmem_fault_shared().

[1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/
[2] https://github.com/torvalds/linux/blob/8477ab143069c6b05d6da4a8184ded8b969240f5/mm/filemap.c#L3373
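
Roughly along the lines of the filemap_fault() check in [2], i.e.
something like (sketch only):

	static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vmf->vma->vm_file);
		pgoff_t max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

		/* Fail faults beyond EOF, as filemap_fault() does. */
		if (unlikely(vmf->pgoff >= max_off))
			return VM_FAULT_SIGBUS;
		...
	}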

> Cheers,
> /fuad
>
>
>> > +     return xa_to_value(entry);
>> > +}
>> > +
>> > +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> > +{
>> > +     if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
>> > +             return ERR_PTR(-EACCES);
>> > +
>> > +     return kvm_gmem_get_folio(inode, index);
>> > +}
>> > +
>> > +#else
>> > +
>> > +static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
>> > +{
>> > +     return 0;
>> > +}
>> > +
>> > +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> > +{
>> > +     WARN_ONCE("Unexpected call to get shared folio.")
>> > +     return NULL;
>> > +}
>> > +
>> > +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>> > +
>> >  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>> >                                   pgoff_t index, struct folio *folio)
>> >  {
>> > @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>> >
>> >       filemap_invalidate_lock_shared(inode->i_mapping);
>> >
>> > -     folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> > +     folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>> >       if (IS_ERR(folio)) {
>> >               int err = PTR_ERR(folio);
>> >
>> > @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
>> >       .fallocate      = kvm_gmem_fallocate,
>> >  };
>> >
>> > +static void kvm_gmem_free_inode(struct inode *inode)
>> > +{
>> > +     struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>> > +
>> > +     kfree(private);
>> > +
>> > +     free_inode_nonrcu(inode);
>> > +}
>> > +
>> > +static void kvm_gmem_destroy_inode(struct inode *inode)
>> > +{
>> > +     struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>> > +
>> > +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> > +     /*
>> > +      * mtree_destroy() can't be used within rcu callback, hence can't be
>> > +      * done in ->free_inode().
>> > +      */
>> > +     if (private)
>> > +             mtree_destroy(&private->shareability);
>> > +#endif
>> > +}
>> > +
>> >  static const struct super_operations kvm_gmem_super_operations = {
>> >       .statfs         = simple_statfs,
>> > +     .destroy_inode  = kvm_gmem_destroy_inode,
>> > +     .free_inode     = kvm_gmem_free_inode,
>> >  };
>> >
>> >  static int kvm_gmem_init_fs_context(struct fs_context *fc)
>> > @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>> >  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>> >                                                     loff_t size, u64 flags)
>> >  {
>> > +     struct kvm_gmem_inode_private *private;
>> >       struct inode *inode;
>> > +     int err;
>> >
>> >       inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>> >       if (IS_ERR(inode))
>> >               return inode;
>> >
>> > +     err = -ENOMEM;
>> > +     private = kzalloc(sizeof(*private), GFP_KERNEL);
>> > +     if (!private)
>> > +             goto out;
>> > +
>> > +     mt_init(&private->shareability);
>> Wrap the mt_init() inside "#ifdef CONFIG_KVM_GMEM_SHARED_MEM" ?
>>
>> > +     inode->i_mapping->i_private_data = private;
>> > +
>> > +     err = kvm_gmem_shareability_setup(private, size, flags);
>> > +     if (err)
>> > +             goto out;
>> > +
>> >       inode->i_private = (void *)(unsigned long)flags;
>> >       inode->i_op = &kvm_gmem_iops;
>> >       inode->i_mapping->a_ops = &kvm_gmem_aops;
>> > @@ -566,6 +676,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>> >       WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
>> >
>> >       return inode;
>> > +
>> > +out:
>> > +     iput(inode);
>> > +
>> > +     return ERR_PTR(err);
>> >  }
>> >
>> >  static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
>> > @@ -654,6 +769,9 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
>> >       if (kvm_arch_vm_supports_gmem_shared_mem(kvm))
>> >               valid_flags |= GUEST_MEMFD_FLAG_SUPPORT_SHARED;
>> >
>> > +     if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
>> > +             valid_flags |= GUEST_MEMFD_FLAG_INIT_PRIVATE;
>> > +
>> >       if (flags & ~valid_flags)
>> >               return -EINVAL;
>> >
>> > @@ -842,6 +960,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>> >       if (!file)
>> >               return -EFAULT;
>> >
>> > +     filemap_invalidate_lock_shared(file_inode(file)->i_mapping);
>> > +
>> >       folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
>> >       if (IS_ERR(folio)) {
>> >               r = PTR_ERR(folio);
>> > @@ -857,8 +977,8 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
>> >               *page = folio_file_page(folio, index);
>> >       else
>> >               folio_put(folio);
>> > -
>> >  out:
>> > +     filemap_invalidate_unlock_shared(file_inode(file)->i_mapping);
>> >       fput(file);
>> >       return r;
>> >  }
>> > --
>> > 2.49.0.1045.g170613ef41-goog
>> >
>> >
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Fuad Tabba 6 months, 2 weeks ago
Hi Ackerley,

On Fri, 30 May 2025 at 19:32, Ackerley Tng <ackerleytng@google.com> wrote:
>
> Fuad Tabba <tabba@google.com> writes:
>
> > Hi,
> >
> > .. snip..
> >
> >> I noticed that in [1], kvm_gmem_mmap() does not check the range.
> >> So, the WARN() here can be hit when userspace mmap()s an area larger than the
> >> inode size and accesses the out-of-range HVA.
> >>
> >> Maybe limit the mmap() range?
> >>
> >> @@ -1609,6 +1620,10 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
> >>         if (!kvm_gmem_supports_shared(file_inode(file)))
> >>                 return -ENODEV;
> >>
> >> +       if (vma->vm_end - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT) > i_size_read(file_inode(file)))
> >> +               return -EINVAL;
> >> +
> >>         if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
> >>             (VM_SHARED | VM_MAYSHARE)) {
> >>                 return -EINVAL;
> >>
> >> [1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/
> >
> > I don't think we want to do that for a couple of reasons. We catch
> > such invalid accesses on faulting, and, by analogy, afaict, neither
> > secretmem nor memfd perform a similar check on mmap (nor do
> > memory-mapped files in general).
> >
> > There are also valid reasons why a user would want to deliberately
> > mmap more memory than the backing store, knowing that it's only going
> > to fault what it's going to use, e.g., alignment.
> >
>
> This is a good point.
>
> I think there's no check against the inode size on faulting now though?
> v10's [1] kvm_gmem_fault_shared() calls kvm_gmem_get_folio()
> straightaway.
>
> We should add a check like [2] to kvm_gmem_fault_shared().

Yes! I mistakenly thought that kvm_gmem_get_folio() had such a check,
I just verified that it doesn't. I have added the check, as well as a
new selftest to make sure we don't miss it in the future.

Thanks!
/fuad

> [1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/
> [2] https://github.com/torvalds/linux/blob/8477ab143069c6b05d6da4a8184ded8b969240f5/mm/filemap.c#L3373
>
> > .. snip..
Re: [RFC PATCH v2 02/51] KVM: guest_memfd: Introduce and use shareability to guard faulting
Posted by Ackerley Tng 6 months, 3 weeks ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Wed, May 14, 2025 at 04:41:41PM -0700, Ackerley Tng wrote:
>> Track guest_memfd memory's shareability status within the inode as
>> opposed to the file, since it is property of the guest_memfd's memory
>> contents.
>> 
>> Shareability is a property of the memory and is indexed using the
>> page's index in the inode. Because shareability is the memory's
>> property, it is stored within guest_memfd instead of within KVM, like
>> in kvm->mem_attr_array.
>> 
>> KVM_MEMORY_ATTRIBUTE_PRIVATE in kvm->mem_attr_array must still be
>> retained to allow VMs to only use guest_memfd for private memory and
>> some other memory for shared memory.
>> 
>> Not all use cases require guest_memfd() to be shared with the host
>> when first created. Add a new flag, GUEST_MEMFD_FLAG_INIT_PRIVATE,
>> which when set on KVM_CREATE_GUEST_MEMFD, initializes the memory as
>> private to the guest, and therefore not mappable by the
>> host. Otherwise, memory is shared until explicitly converted to
>> private.
>> 
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> Co-developed-by: Vishal Annapurve <vannapurve@google.com>
>> Signed-off-by: Vishal Annapurve <vannapurve@google.com>
>> Co-developed-by: Fuad Tabba <tabba@google.com>
>> Signed-off-by: Fuad Tabba <tabba@google.com>
>> Change-Id: If03609cbab3ad1564685c85bdba6dcbb6b240c0f
>> ---
>>  Documentation/virt/kvm/api.rst |   5 ++
>>  include/uapi/linux/kvm.h       |   2 +
>>  virt/kvm/guest_memfd.c         | 124 ++++++++++++++++++++++++++++++++-
>>  3 files changed, 129 insertions(+), 2 deletions(-)
>> 
>> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
>> index 86f74ce7f12a..f609337ae1c2 100644
>> --- a/Documentation/virt/kvm/api.rst
>> +++ b/Documentation/virt/kvm/api.rst
>> @@ -6408,6 +6408,11 @@ belonging to the slot via its userspace_addr.
>>  The use of GUEST_MEMFD_FLAG_SUPPORT_SHARED will not be allowed for CoCo VMs.
>>  This is validated when the guest_memfd instance is bound to the VM.
>>  
>> +If the capability KVM_CAP_GMEM_CONVERSIONS is supported, then the 'flags' field
>> +supports GUEST_MEMFD_FLAG_INIT_PRIVATE.  Setting GUEST_MEMFD_FLAG_INIT_PRIVATE
>> +will initialize the memory for the guest_memfd as guest-only and not faultable
>> +by the host.
>> +
>>  See KVM_SET_USER_MEMORY_REGION2 for additional details.
>>  
>>  4.143 KVM_PRE_FAULT_MEMORY
>> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
>> index 4cc824a3a7c9..d7df312479aa 100644
>> --- a/include/uapi/linux/kvm.h
>> +++ b/include/uapi/linux/kvm.h
>> @@ -1567,7 +1567,9 @@ struct kvm_memory_attributes {
>>  #define KVM_MEMORY_ATTRIBUTE_PRIVATE           (1ULL << 3)
>>  
>>  #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
>> +
>>  #define GUEST_MEMFD_FLAG_SUPPORT_SHARED	(1UL << 0)
>> +#define GUEST_MEMFD_FLAG_INIT_PRIVATE	(1UL << 1)
>>  
>>  struct kvm_create_guest_memfd {
>>  	__u64 size;
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 239d0f13dcc1..590932499eba 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -4,6 +4,7 @@
>>  #include <linux/falloc.h>
>>  #include <linux/fs.h>
>>  #include <linux/kvm_host.h>
>> +#include <linux/maple_tree.h>
>>  #include <linux/pseudo_fs.h>
>>  #include <linux/pagemap.h>
>>  
>> @@ -17,6 +18,24 @@ struct kvm_gmem {
>>  	struct list_head entry;
>>  };
>>  
>> +struct kvm_gmem_inode_private {
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +	struct maple_tree shareability;
>> +#endif
>> +};
>> +
>> +enum shareability {
>> +	SHAREABILITY_GUEST = 1,	/* Only the guest can map (fault) folios in this range. */
>> +	SHAREABILITY_ALL = 2,	/* Both guest and host can fault folios in this range. */
>> +};
>> +
>> +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
>> +
>> +static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>> +{
>> +	return inode->i_mapping->i_private_data;
>> +}
>> +
>>  /**
>>   * folio_file_pfn - like folio_file_page, but return a pfn.
>>   * @folio: The folio which contains this index.
>> @@ -29,6 +48,58 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
>>  	return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
>>  }
>>  
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +
>> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private,
>> +				      loff_t size, u64 flags)
>> +{
>> +	enum shareability m;
>> +	pgoff_t last;
>> +
>> +	last = (size >> PAGE_SHIFT) - 1;
>> +	m = flags & GUEST_MEMFD_FLAG_INIT_PRIVATE ? SHAREABILITY_GUEST :
>> +						    SHAREABILITY_ALL;
>> +	return mtree_store_range(&private->shareability, 0, last, xa_mk_value(m),
>> +				 GFP_KERNEL);
>> +}
>> +
>> +static enum shareability kvm_gmem_shareability_get(struct inode *inode,
>> +						 pgoff_t index)
>> +{
>> +	struct maple_tree *mt;
>> +	void *entry;
>> +
>> +	mt = &kvm_gmem_private(inode)->shareability;
>> +	entry = mtree_load(mt, index);
>> +	WARN(!entry,
>> +	     "Shareability should always be defined for all indices in inode.");
> I noticed that in [1], kvm_gmem_mmap() does not check the range.
> So, the WARN() here can be hit when userspace mmap()s an area larger
> than the inode size and accesses the out-of-bounds HVA.
>
> Maybe limit the mmap() range?
>
> @@ -1609,6 +1620,10 @@ static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
>         if (!kvm_gmem_supports_shared(file_inode(file)))
>                 return -ENODEV;
>
> +       if (vma->vm_end - vma->vm_start + (vma->vm_pgoff << PAGE_SHIFT) > i_size_read(file_inode(file)))
> +               return -EINVAL;
> +
>         if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) !=
>             (VM_SHARED | VM_MAYSHARE)) {
>                 return -EINVAL;
>
> [1] https://lore.kernel.org/all/20250513163438.3942405-8-tabba@google.com/
>

This is a good idea, thanks! I think it would also be good to include
this in the guest_memfd mmap base series that Fuad is working on [1],
maybe in v11.

[1] https://lore.kernel.org/all/20250527180245.1413463-1-tabba@google.com/

>> +	return xa_to_value(entry);
>> +}
>> +
>> +static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> +{
>> +	if (kvm_gmem_shareability_get(inode, index) != SHAREABILITY_ALL)
>> +		return ERR_PTR(-EACCES);
>> +
>> +	return kvm_gmem_get_folio(inode, index);
>> +}
>> +
>> +#else
>> +
> >> +static int kvm_gmem_shareability_setup(struct kvm_gmem_inode_private *private, loff_t size, u64 flags)
>> +{
>> +	return 0;
>> +}
>> +
>> +static inline struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t index)
>> +{
>> +	WARN_ONCE("Unexpected call to get shared folio.")
>> +	return NULL;
>> +}
>> +
>> +#endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>> +
>>  static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
>>  				    pgoff_t index, struct folio *folio)
>>  {
>> @@ -333,7 +404,7 @@ static vm_fault_t kvm_gmem_fault_shared(struct vm_fault *vmf)
>>  
>>  	filemap_invalidate_lock_shared(inode->i_mapping);
>>  
>> -	folio = kvm_gmem_get_folio(inode, vmf->pgoff);
>> +	folio = kvm_gmem_get_shared_folio(inode, vmf->pgoff);
>>  	if (IS_ERR(folio)) {
>>  		int err = PTR_ERR(folio);
>>  
>> @@ -420,8 +491,33 @@ static struct file_operations kvm_gmem_fops = {
>>  	.fallocate	= kvm_gmem_fallocate,
>>  };
>>  
>> +static void kvm_gmem_free_inode(struct inode *inode)
>> +{
>> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>> +
>> +	kfree(private);
>> +
>> +	free_inode_nonrcu(inode);
>> +}
>> +
>> +static void kvm_gmem_destroy_inode(struct inode *inode)
>> +{
>> +	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
>> +
>> +#ifdef CONFIG_KVM_GMEM_SHARED_MEM
>> +	/*
>> +	 * mtree_destroy() can't be used within rcu callback, hence can't be
>> +	 * done in ->free_inode().
>> +	 */
>> +	if (private)
>> +		mtree_destroy(&private->shareability);
>> +#endif
>> +}
>> +
>>  static const struct super_operations kvm_gmem_super_operations = {
>>  	.statfs		= simple_statfs,
>> +	.destroy_inode	= kvm_gmem_destroy_inode,
>> +	.free_inode	= kvm_gmem_free_inode,
>>  };
>>  
>>  static int kvm_gmem_init_fs_context(struct fs_context *fc)
>> @@ -549,12 +645,26 @@ static const struct inode_operations kvm_gmem_iops = {
>>  static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
>>  						      loff_t size, u64 flags)
>>  {
>> +	struct kvm_gmem_inode_private *private;
>>  	struct inode *inode;
>> +	int err;
>>  
>>  	inode = alloc_anon_secure_inode(kvm_gmem_mnt->mnt_sb, name);
>>  	if (IS_ERR(inode))
>>  		return inode;
>>  
>> +	err = -ENOMEM;
>> +	private = kzalloc(sizeof(*private), GFP_KERNEL);
>> +	if (!private)
>> +		goto out;
>> +
>> +	mt_init(&private->shareability);
> Wrap the mt_init() inside "#ifdef CONFIG_KVM_GMEM_SHARED_MEM" ?
>

Will fix this in the next revision. Will also update this to only
initialize shareability if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED).
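
Concretely, something along these lines (a sketch of the intended
change; the final form may differ):

	err = -ENOMEM;
	private = kzalloc(sizeof(*private), GFP_KERNEL);
	if (!private)
		goto out;

#ifdef CONFIG_KVM_GMEM_SHARED_MEM
	/* Only track shareability when this guest_memfd supports sharing. */
	if (flags & GUEST_MEMFD_FLAG_SUPPORT_SHARED)
		mt_init(&private->shareability);
#endif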

>> .. snip..