[RFC PATCH 14/39] KVM: guest_memfd: hugetlb: initialization and cleanup

Ackerley Tng posted 39 patches 2 months, 2 weeks ago
[RFC PATCH 14/39] KVM: guest_memfd: hugetlb: initialization and cleanup
Posted by Ackerley Tng 2 months, 2 weeks ago
First stage of hugetlb support: add initialization and cleanup
routines.

After guest_mem was massaged to use guest_mem inodes instead of
anonymous inodes in an earlier patch, the .evict_inode handler can now
be overridden to do hugetlb metadata cleanup.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 include/uapi/linux/kvm.h |  26 ++++++
 virt/kvm/guest_memfd.c   | 177 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 197 insertions(+), 6 deletions(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 637efc055145..77de7c4432f6 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/ioctl.h>
 #include <asm/kvm.h>
+#include <asm-generic/hugetlb_encode.h>
 
 #define KVM_API_VERSION 12
 
@@ -1558,6 +1559,31 @@ struct kvm_memory_attributes {
 
 #define KVM_CREATE_GUEST_MEMFD	_IOWR(KVMIO,  0xd4, struct kvm_create_guest_memfd)
 
+#define KVM_GUEST_MEMFD_HUGETLB (1ULL << 1)
+
+/*
+ * Huge page size encoding when KVM_GUEST_MEMFD_HUGETLB is specified, and a huge
+ * page size other than the default is desired.  See hugetlb_encode.h.  All
+ * known huge page size encodings are provided here.  It is the responsibility
+ * of the application to know which sizes are supported on the running system.
+ * See mmap(2) man page for details.
+ */
+#define KVM_GUEST_MEMFD_HUGE_SHIFT     HUGETLB_FLAG_ENCODE_SHIFT
+#define KVM_GUEST_MEMFD_HUGE_MASK      HUGETLB_FLAG_ENCODE_MASK
+
+#define KVM_GUEST_MEMFD_HUGE_64KB      HUGETLB_FLAG_ENCODE_64KB
+#define KVM_GUEST_MEMFD_HUGE_512KB     HUGETLB_FLAG_ENCODE_512KB
+#define KVM_GUEST_MEMFD_HUGE_1MB       HUGETLB_FLAG_ENCODE_1MB
+#define KVM_GUEST_MEMFD_HUGE_2MB       HUGETLB_FLAG_ENCODE_2MB
+#define KVM_GUEST_MEMFD_HUGE_8MB       HUGETLB_FLAG_ENCODE_8MB
+#define KVM_GUEST_MEMFD_HUGE_16MB      HUGETLB_FLAG_ENCODE_16MB
+#define KVM_GUEST_MEMFD_HUGE_32MB      HUGETLB_FLAG_ENCODE_32MB
+#define KVM_GUEST_MEMFD_HUGE_256MB     HUGETLB_FLAG_ENCODE_256MB
+#define KVM_GUEST_MEMFD_HUGE_512MB     HUGETLB_FLAG_ENCODE_512MB
+#define KVM_GUEST_MEMFD_HUGE_1GB       HUGETLB_FLAG_ENCODE_1GB
+#define KVM_GUEST_MEMFD_HUGE_2GB       HUGETLB_FLAG_ENCODE_2GB
+#define KVM_GUEST_MEMFD_HUGE_16GB      HUGETLB_FLAG_ENCODE_16GB
+
 struct kvm_create_guest_memfd {
 	__u64 size;
 	__u64 flags;
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 5d7fd1f708a6..31e1115273e1 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -3,6 +3,7 @@
 #include <linux/mount.h>
 #include <linux/backing-dev.h>
 #include <linux/falloc.h>
+#include <linux/hugetlb.h>
 #include <linux/kvm_host.h>
 #include <linux/pseudo_fs.h>
 #include <linux/pagemap.h>
@@ -18,6 +19,16 @@ struct kvm_gmem {
 	struct list_head entry;
 };
 
+struct kvm_gmem_hugetlb {
+	struct hstate *h;
+	struct hugepage_subpool *spool;
+};
+
+static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+{
+	return inode->i_mapping->i_private_data;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -154,6 +165,82 @@ static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
 	}
 }
 
+static inline void kvm_gmem_hugetlb_filemap_remove_folio(struct folio *folio)
+{
+	folio_lock(folio);
+
+	folio_clear_dirty(folio);
+	folio_clear_uptodate(folio);
+	filemap_remove_folio(folio);
+
+	folio_unlock(folio);
+}
+
+/**
+ * Removes folios in range [@lstart, @lend) from page cache/filemap (@mapping),
+ * returning the number of pages freed.
+ */
+static int kvm_gmem_hugetlb_filemap_remove_folios(struct address_space *mapping,
+						  struct hstate *h,
+						  loff_t lstart, loff_t lend)
+{
+	const pgoff_t end = lend >> PAGE_SHIFT;
+	pgoff_t next = lstart >> PAGE_SHIFT;
+	struct folio_batch fbatch;
+	int num_freed = 0;
+
+	folio_batch_init(&fbatch);
+	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+		int i;
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			struct folio *folio;
+			pgoff_t hindex;
+			u32 hash;
+
+			folio = fbatch.folios[i];
+			hindex = folio->index >> huge_page_order(h);
+			hash = hugetlb_fault_mutex_hash(mapping, hindex);
+
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			kvm_gmem_hugetlb_filemap_remove_folio(folio);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+			num_freed++;
+		}
+		folio_batch_release(&fbatch);
+		cond_resched();
+	}
+
+	return num_freed;
+}
+
+/**
+ * Removes folios in range [@lstart, @lend) from page cache of inode, updates
+ * inode metadata and hugetlb reservations.
+ */
+static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
+						   loff_t lstart, loff_t lend)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	struct hstate *h;
+	int gbl_reserve;
+	int num_freed;
+
+	hgmem = kvm_gmem_hgmem(inode);
+	h = hgmem->h;
+
+	num_freed = kvm_gmem_hugetlb_filemap_remove_folios(inode->i_mapping,
+							   h, lstart, lend);
+
+	gbl_reserve = hugepage_subpool_put_pages(hgmem->spool, num_freed);
+	hugetlb_acct_memory(h, -gbl_reserve);
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks -= blocks_per_huge_page(h) * num_freed;
+	spin_unlock(&inode->i_lock);
+}
+
+
 static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	struct list_head *gmem_list = &inode->i_mapping->i_private_list;
@@ -307,8 +394,33 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
 	return get_file_active(&slot->gmem.file);
 }
 
+static void kvm_gmem_hugetlb_teardown(struct inode *inode)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+
+	truncate_inode_pages_final_prepare(inode->i_mapping);
+	kvm_gmem_hugetlb_truncate_folios_range(inode, 0, LLONG_MAX);
+
+	hgmem = kvm_gmem_hgmem(inode);
+	hugepage_put_subpool(hgmem->spool);
+	kfree(hgmem);
+}
+
+static void kvm_gmem_evict_inode(struct inode *inode)
+{
+	u64 flags = (u64)inode->i_private;
+
+	if (flags & KVM_GUEST_MEMFD_HUGETLB)
+		kvm_gmem_hugetlb_teardown(inode);
+	else
+		truncate_inode_pages_final(inode->i_mapping);
+
+	clear_inode(inode);
+}
+
 static const struct super_operations kvm_gmem_super_operations = {
 	.statfs		= simple_statfs,
+	.evict_inode	= kvm_gmem_evict_inode,
 };
 
 static int kvm_gmem_init_fs_context(struct fs_context *fc)
@@ -431,6 +543,42 @@ static const struct inode_operations kvm_gmem_iops = {
 	.setattr	= kvm_gmem_setattr,
 };
 
+static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
+{
+	struct kvm_gmem_hugetlb *hgmem;
+	struct hugepage_subpool *spool;
+	int page_size_log;
+	struct hstate *h;
+	long hpages;
+
+	page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
+	h = hstate_sizelog(page_size_log);
+
+	/* Round up to accommodate size requests that don't align with huge pages */
+	hpages = round_up(size, huge_page_size(h)) >> huge_page_shift(h);
+
+	spool = hugepage_new_subpool(h, hpages, hpages, false);
+	if (!spool)
+		goto err;
+
+	hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
+	if (!hgmem)
+		goto err_subpool;
+
+	inode->i_blkbits = huge_page_shift(h);
+
+	hgmem->h = h;
+	hgmem->spool = spool;
+	inode->i_mapping->i_private_data = hgmem;
+
+	return 0;
+
+err_subpool:
+	kfree(spool);
+err:
+	return -ENOMEM;
+}
+
 static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 						      loff_t size, u64 flags)
 {
@@ -443,9 +591,13 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 		return inode;
 
 	err = security_inode_init_security_anon(inode, &qname, NULL);
-	if (err) {
-		iput(inode);
-		return ERR_PTR(err);
+	if (err)
+		goto out;
+
+	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+		err = kvm_gmem_hugetlb_setup(inode, size, flags);
+		if (err)
+			goto out;
 	}
 
 	inode->i_private = (void *)(unsigned long)flags;
@@ -459,6 +611,11 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 	WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
 
 	return inode;
+
+out:
+	iput(inode);
+
+	return ERR_PTR(err);
 }
 
 static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size,
@@ -526,14 +683,22 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
 	return err;
 }
 
+#define KVM_GUEST_MEMFD_ALL_FLAGS KVM_GUEST_MEMFD_HUGETLB
+
 int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
 {
 	loff_t size = args->size;
 	u64 flags = args->flags;
-	u64 valid_flags = 0;
 
-	if (flags & ~valid_flags)
-		return -EINVAL;
+	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
+		/* Allow huge page size encoding in flags */
+		if (flags & ~(KVM_GUEST_MEMFD_ALL_FLAGS |
+			      (KVM_GUEST_MEMFD_HUGE_MASK << KVM_GUEST_MEMFD_HUGE_SHIFT)))
+			return -EINVAL;
+	} else {
+		if (flags & ~KVM_GUEST_MEMFD_ALL_FLAGS)
+			return -EINVAL;
+	}
 
 	if (size <= 0 || !PAGE_ALIGNED(size))
 		return -EINVAL;
-- 
2.46.0.598.g6f2099f65c-goog
Re: [RFC PATCH 14/39] KVM: guest_memfd: hugetlb: initialization and cleanup
Posted by Vishal Annapurve 2 months, 1 week ago
On Wed, Sep 11, 2024 at 1:44 AM Ackerley Tng <ackerleytng@google.com> wrote:
>
> ...
> +}
> +
> +static void kvm_gmem_evict_inode(struct inode *inode)
> +{
> +       u64 flags = (u64)inode->i_private;
> +
> +       if (flags & KVM_GUEST_MEMFD_HUGETLB)
> +               kvm_gmem_hugetlb_teardown(inode);
> +       else
> +               truncate_inode_pages_final(inode->i_mapping);
> +
> +       clear_inode(inode);
> +}
> +
>  static const struct super_operations kvm_gmem_super_operations = {
>         .statfs         = simple_statfs,
> +       .evict_inode    = kvm_gmem_evict_inode,

Ackerley, can we use free_inode[1] callback to free any special
metadata associated with the inode instead of relying on
super_operations?

[1] https://elixir.bootlin.com/linux/v6.11/source/include/linux/fs.h#L719

> ...


>
>         if (size <= 0 || !PAGE_ALIGNED(size))
>                 return -EINVAL;
> --
> 2.46.0.598.g6f2099f65c-goog
>
Re: [RFC PATCH 14/39] KVM: guest_memfd: hugetlb: initialization and cleanup
Posted by Ackerley Tng 1 month, 4 weeks ago
Vishal Annapurve <vannapurve@google.com> writes:

> On Wed, Sep 11, 2024 at 1:44 AM Ackerley Tng <ackerleytng@google.com> wrote:
>>
>> ...
>> +}
>> +
>> +static void kvm_gmem_evict_inode(struct inode *inode)
>> +{
>> +       u64 flags = (u64)inode->i_private;
>> +
>> +       if (flags & KVM_GUEST_MEMFD_HUGETLB)
>> +               kvm_gmem_hugetlb_teardown(inode);
>> +       else
>> +               truncate_inode_pages_final(inode->i_mapping);
>> +
>> +       clear_inode(inode);
>> +}
>> +
>>  static const struct super_operations kvm_gmem_super_operations = {
>>         .statfs         = simple_statfs,
>> +       .evict_inode    = kvm_gmem_evict_inode,
>
> Ackerley, can we use free_inode[1] callback to free any special
> metadata associated with the inode instead of relying on
> super_operations?
>
> [1] https://elixir.bootlin.com/linux/v6.11/source/include/linux/fs.h#L719
>

.free_inode() is not a direct replacement for .evict_inode().

If the .free_inode() op is NULL, free_inode_nonrcu(inode) handles freeing the
struct inode itself. Hence, the .free_inode() op is meant for freeing the inode
struct.

.free_inode() should undo what .alloc_inode() does.

There's more information about the ops free_inode() here
https://docs.kernel.org/filesystems/porting.html, specifically

| Rules for inode destruction:
|
| + if ->destroy_inode() is non-NULL, it gets called
| + if ->free_inode() is non-NULL, it gets scheduled by call_rcu()
| + combination of NULL ->destroy_inode and NULL ->free_inode is treated as
|   NULL/free_inode_nonrcu, to preserve the compatibility.

The common setup is to have a larger containing struct containing a struct
inode, and the .free_inode() op will then free the larger struct. In our case,
we're not using a containing struct for the metadata, so .free_inode() isn't the
appropriate op.

I think this question might be related to Sean's question at LPC about whether
it is necessary for guest_memfd to have its own mount, as opposed to using the
anon_inode_mnt.

I believe having its own mount is the correct approach, my reasoning is as
follows

1. We want to clean up these inode metadata when the last reference to the inode
   is dropped
2. That means using some op on the iput_final() path.
3. All the ops on the iput_final() path are in struct super_operations, which is
   part of struct super_block
4. struct super_block should be used together with a mount

Hence, I think it is correct to have a guest_memfd mount. I guess it might be
possible to have a static super_block without a mount, but that seems hacky and
brittle, and I'm not aware of any precedent for a static super_block.

Sean, what are your concerns with having a guest_memfd mount?

Comparing the callbacks along the iput_final() path, we have these:

+ .drop_inode() determines whether to evict the inode, so that's not the
  approprate op.
+ .evict_inode() is the current proposal, which is a place where the inode's
  fields are cleaned up. HugeTLB uses this to clean up resv_map, which it also
  stores in inode->i_mapping->i_private_data.
+ .destroy_inode() should clean up inode allocation if inode allocation involves
  a containing struct (like shmem_inode_info). Shmem uses this to clean up a
  struct shared_policy, which we will eventually need to store as well.
+ .free_inode() is the rcu-delayed part that completes inode cleanup.

Using .free_inode() implies using a containing struct to follow the
convention. Between putting metadata in a containing struct and using
inode->i_mapping->i_private_data, I think using inode->i_mapping->i_private_data
is less complex since it avoids needing a custom .alloc_inode() op.

Other than using inode->i_mapping->i_private_data, there's the option of
combining the metadata with guest_memfd flags, and storing everything in
inode->i_private.

Because inode->i_mapping actually points to inode->i_data and i_data is
a part of the inode (not a pointer), .evict_inode() is still the op to
use to clean both inode->i_mapping->i_private_data and inode->i_private.

I think we should stick with storing metadata (faultability xarray and hugetlb
pool reference) in inode->i_mapping->i_private_data because both of these are
properties of the page cache/filemap.

When we need to store a memory policy, we might want to use .destroy_inode() to
align with shmem.

What do you all think?

And there's no way to set inode->free_inode directly and skip copying
from inode->i_sb->s_op. All the code paths going to i_callback() copy
inode->i_sb->s_op->free_inode to inode->free_inode before calling
.free_inode() in i_callback() to complete the inode cleanup.