If HugeTLB is requested at guest_memfd creation time, HugeTLB pages
will be used to back guest_memfd.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
virt/kvm/guest_memfd.c | 252 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 239 insertions(+), 13 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 31e1115273e1..2e6f12e2bac8 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -8,6 +8,8 @@
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
#include <linux/anon_inodes.h>
+#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
#include "kvm_mm.h"
@@ -29,6 +31,13 @@ static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
return inode->i_mapping->i_private_data;
}
+static bool is_kvm_gmem_hugetlb(struct inode *inode)
+{
+ u64 flags = (u64)inode->i_private;
+
+ return flags & KVM_GUEST_MEMFD_HUGETLB;
+}
+
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -58,6 +67,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
return 0;
}
+/**
+ * Use the uptodate flag to indicate that the folio is prepared for KVM's usage.
+ */
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
folio_mark_uptodate(folio);
@@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio)
static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
gfn_t gfn, struct folio *folio)
{
- unsigned long nr_pages, i;
pgoff_t index;
int r;
- nr_pages = folio_nr_pages(folio);
- for (i = 0; i < nr_pages; i++)
- clear_highpage(folio_page(folio, i));
+ if (folio_test_hugetlb(folio)) {
+ folio_zero_user(folio, folio->index << PAGE_SHIFT);
+ } else {
+ unsigned long nr_pages, i;
+
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+ }
/*
* Preparing huge folios should always be safe, since it should
@@ -103,6 +120,174 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
+static int kvm_gmem_get_mpol_node_nodemask(gfp_t gfp_mask,
+ struct mempolicy **mpol,
+ nodemask_t **nodemask)
+{
+ /*
+ * TODO: mempolicy would probably have to be stored on the inode, use
+ * task policy for now.
+ */
+ *mpol = get_task_policy(current);
+
+ /* TODO: ignore interleaving (set ilx to 0) for now. */
+ return policy_node_nodemask(*mpol, gfp_mask, 0, nodemask);
+}
+
+static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h,
+ struct hugepage_subpool *spool)
+{
+ bool memcg_charge_was_prepared;
+ struct mem_cgroup *memcg;
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct folio *folio;
+ gfp_t gfp_mask;
+ int ret;
+ int nid;
+
+ gfp_mask = htlb_alloc_mask(h);
+
+ memcg = get_mem_cgroup_from_current();
+ ret = mem_cgroup_hugetlb_try_charge(memcg,
+ gfp_mask | __GFP_RETRY_MAYFAIL,
+ pages_per_huge_page(h));
+ if (ret == -ENOMEM)
+ goto err;
+
+ memcg_charge_was_prepared = ret != -EOPNOTSUPP;
+
+ /* Pages are only to be taken from guest_memfd subpool and nowhere else. */
+ if (hugepage_subpool_get_pages(spool, 1))
+ goto err_cancel_charge;
+
+ nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol,
+ &nodemask);
+ /*
+ * charge_cgroup_reservation is false because we didn't make any cgroup
+ * reservations when creating the guest_memfd subpool.
+ *
+ * use_hstate_resv is true because we reserved from global hstate when
+ * creating the guest_memfd subpool.
+ */
+ folio = hugetlb_alloc_folio(h, mpol, nid, nodemask, false, true);
+ mpol_cond_put(mpol);
+
+ if (!folio)
+ goto err_put_pages;
+
+ hugetlb_set_folio_subpool(folio, spool);
+
+ if (memcg_charge_was_prepared)
+ mem_cgroup_commit_charge(folio, memcg);
+
+out:
+ mem_cgroup_put(memcg);
+
+ return folio;
+
+err_put_pages:
+ hugepage_subpool_put_pages(spool, 1);
+
+err_cancel_charge:
+ if (memcg_charge_was_prepared)
+ mem_cgroup_cancel_charge(memcg, pages_per_huge_page(h));
+
+err:
+ folio = ERR_PTR(-ENOMEM);
+ goto out;
+}
+
+static int kvm_gmem_hugetlb_filemap_add_folio(struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ gfp_t gfp)
+{
+ int ret;
+
+ __folio_set_locked(folio);
+ ret = __filemap_add_folio(mapping, folio, index, gfp, NULL);
+ if (unlikely(ret)) {
+ __folio_clear_locked(folio);
+ return ret;
+ }
+
+ /*
+ * In hugetlb_add_to_page_cache(), there is a call to
+ * folio_clear_hugetlb_restore_reserve(). This is handled when the pages
+ * are removed from the page cache in unmap_hugepage_range() ->
+ * __unmap_hugepage_range() by conditionally calling
+ * folio_set_hugetlb_restore_reserve(). In kvm_gmem_hugetlb's usage of
+ * hugetlb, there are no VMAs involved, and pages are never taken from
+ * the surplus, so when pages are freed, the hstate reserve must be
+ * restored. Hence, this function makes no call to
+ * folio_clear_hugetlb_restore_reserve().
+ */
+
+ /* mark folio dirty so that it will not be removed from cache/inode */
+ folio_mark_dirty(folio);
+
+ return 0;
+}
+
+static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode,
+ pgoff_t index)
+{
+ struct kvm_gmem_hugetlb *hgmem;
+ struct folio *folio;
+ int ret;
+
+ hgmem = kvm_gmem_hgmem(inode);
+ folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool);
+ if (IS_ERR(folio))
+ return folio;
+
+ /* TODO: Fix index here to be aligned to huge page size. */
+ ret = kvm_gmem_hugetlb_filemap_add_folio(
+ inode->i_mapping, folio, index, htlb_alloc_mask(hgmem->h));
+ if (ret) {
+ folio_put(folio);
+ return ERR_PTR(ret);
+ }
+
+ spin_lock(&inode->i_lock);
+ inode->i_blocks += blocks_per_huge_page(hgmem->h);
+ spin_unlock(&inode->i_lock);
+
+ return folio;
+}
+
+static struct folio *kvm_gmem_get_hugetlb_folio(struct inode *inode,
+ pgoff_t index)
+{
+ struct address_space *mapping;
+ struct folio *folio;
+ struct hstate *h;
+ pgoff_t hindex;
+ u32 hash;
+
+ h = kvm_gmem_hgmem(inode)->h;
+ hindex = index >> huge_page_order(h);
+ mapping = inode->i_mapping;
+
+ /* To lock, we calculate the hash using the hindex and not index. */
+ hash = hugetlb_fault_mutex_hash(mapping, hindex);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ /*
+ * The filemap is indexed with index and not hindex. Taking lock on
+ * folio to align with kvm_gmem_get_regular_folio()
+ */
+ folio = filemap_lock_folio(mapping, index);
+ if (!IS_ERR(folio))
+ goto out;
+
+ folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index);
+out:
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ return folio;
+}
+
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
@@ -114,8 +299,10 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
*/
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
- /* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ if (is_kvm_gmem_hugetlb(inode))
+ return kvm_gmem_get_hugetlb_folio(inode, index);
+ else
+ return filemap_grab_folio(inode->i_mapping, index);
}
static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
@@ -240,6 +427,35 @@ static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode,
spin_unlock(&inode->i_lock);
}
+static void kvm_gmem_hugetlb_truncate_range(struct inode *inode, loff_t lstart,
+ loff_t lend)
+{
+ loff_t full_hpage_start;
+ loff_t full_hpage_end;
+ unsigned long hsize;
+ struct hstate *h;
+
+ h = kvm_gmem_hgmem(inode)->h;
+ hsize = huge_page_size(h);
+
+ full_hpage_start = round_up(lstart, hsize);
+ full_hpage_end = round_down(lend, hsize);
+
+ if (lstart < full_hpage_start) {
+ hugetlb_zero_partial_page(h, inode->i_mapping, lstart,
+ full_hpage_start);
+ }
+
+ if (full_hpage_end > full_hpage_start) {
+ kvm_gmem_hugetlb_truncate_folios_range(inode, full_hpage_start,
+ full_hpage_end);
+ }
+
+ if (lend > full_hpage_end) {
+ hugetlb_zero_partial_page(h, inode->i_mapping, full_hpage_end,
+ lend);
+ }
+}
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
@@ -257,7 +473,12 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
list_for_each_entry(gmem, gmem_list, entry)
kvm_gmem_invalidate_begin(gmem, start, end);
- truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+ if (is_kvm_gmem_hugetlb(inode)) {
+ kvm_gmem_hugetlb_truncate_range(inode, offset, offset + len);
+ } else {
+ truncate_inode_pages_range(inode->i_mapping, offset,
+ offset + len - 1);
+ }
list_for_each_entry(gmem, gmem_list, entry)
kvm_gmem_invalidate_end(gmem, start, end);
@@ -279,8 +500,15 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
filemap_invalidate_lock_shared(mapping);
- start = offset >> PAGE_SHIFT;
- end = (offset + len) >> PAGE_SHIFT;
+ if (is_kvm_gmem_hugetlb(inode)) {
+ unsigned long hsize = huge_page_size(kvm_gmem_hgmem(inode)->h);
+
+ start = round_down(offset, hsize) >> PAGE_SHIFT;
+ end = round_down(offset + len, hsize) >> PAGE_SHIFT;
+ } else {
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len) >> PAGE_SHIFT;
+ }
r = 0;
for (index = start; index < end; ) {
@@ -408,9 +636,7 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode)
static void kvm_gmem_evict_inode(struct inode *inode)
{
- u64 flags = (u64)inode->i_private;
-
- if (flags & KVM_GUEST_MEMFD_HUGETLB)
+ if (is_kvm_gmem_hugetlb(inode))
kvm_gmem_hugetlb_teardown(inode);
else
truncate_inode_pages_final(inode->i_mapping);
@@ -827,7 +1053,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot,
*pfn = folio_file_pfn(folio, index);
if (max_order)
- *max_order = 0;
+ *max_order = folio_order(folio);
*is_prepared = folio_test_uptodate(folio);
return folio;
--
2.46.0.598.g6f2099f65c-goog
Hi Ackerley, Due to actual customer requirements (such as ByteDance), I have added support for NUMA policy based on your foundation. Standing on the shoulders of giants, please correct me if there is anything wrong. --- Thanks Jun.miao On 2024/9/11 07:43, Ackerley Tng wrote: > If HugeTLB is requested at guest_memfd creation time, HugeTLB pages > will be used to back guest_memfd. > > Signed-off-by: Ackerley Tng <ackerleytng@google.com> > --- > virt/kvm/guest_memfd.c | 252 ++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 239 insertions(+), 13 deletions(-) > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 31e1115273e1..2e6f12e2bac8 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -8,6 +8,8 @@ > #include <linux/pseudo_fs.h> > #include <linux/pagemap.h> > #include <linux/anon_inodes.h> > +#include <linux/memcontrol.h> > +#include <linux/mempolicy.h> > > #include "kvm_mm.h" > > @@ -29,6 +31,13 @@ static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode) > return inode->i_mapping->i_private_data; > } > > +static bool is_kvm_gmem_hugetlb(struct inode *inode) > +{ > + u64 flags = (u64)inode->i_private; > + > + return flags & KVM_GUEST_MEMFD_HUGETLB; > +} > + > /** > * folio_file_pfn - like folio_file_page, but return a pfn. > * @folio: The folio which contains this index. > @@ -58,6 +67,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo > return 0; > } > > +/** > + * Use the uptodate flag to indicate that the folio is prepared for KVM's usage. 
> + */ > static inline void kvm_gmem_mark_prepared(struct folio *folio) > { > folio_mark_uptodate(folio); > @@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio) > static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, > gfn_t gfn, struct folio *folio) > { > - unsigned long nr_pages, i; > pgoff_t index; > int r; > > - nr_pages = folio_nr_pages(folio); > - for (i = 0; i < nr_pages; i++) > - clear_highpage(folio_page(folio, i)); > + if (folio_test_hugetlb(folio)) { > + folio_zero_user(folio, folio->index << PAGE_SHIFT); > + } else { > + unsigned long nr_pages, i; > + > + nr_pages = folio_nr_pages(folio); > + for (i = 0; i < nr_pages; i++) > + clear_highpage(folio_page(folio, i)); > + } > > /* > * Preparing huge folios should always be safe, since it should > @@ -103,6 +120,174 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, > return r; > } > > +static int kvm_gmem_get_mpol_node_nodemask(gfp_t gfp_mask, > + struct mempolicy **mpol, > + nodemask_t **nodemask) > +{ > + /* > + * TODO: mempolicy would probably have to be stored on the inode, use > + * task policy for now. > + */ > + *mpol = get_task_policy(current); commit bbb0b86af11574516fe78bc1340f49c9e6b7e588 (HEAD -> my-gmem-hugetlb-rfc-v2) Author: Jun Miao <jun.miao@intel.com> Date: Wed Oct 30 11:07:16 2024 -0400 KVM: guest_memfd: add TDX numa policy in hugetlb support Support the numa policy in the gmem hugetlb. This function need the corresponding QEMU patch cooperate to work, and set the numa policy like this in qemu: "--object host-nodes=0,policy=bind". If no set in the Qemu, the policy uses current task policy for now. 
Signed-off-by: Jun Miao <jun.miao@intel.com> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index a49631e47421..cf569fe0740d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -91,6 +91,17 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) return pol; } +struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes); + +int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc); + +int sanitize_mpol_flags(int *mode, unsigned short *flags); + +int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, + unsigned long maxnode); + static inline void mpol_get(struct mempolicy *pol) { if (pol) @@ -202,6 +213,25 @@ static inline void mpol_cond_put(struct mempolicy *pol) { } +struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, + nodemask_t *nodes); +{ +} + +int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc); +{ +} + +int sanitize_mpol_flags(int *mode, unsigned short *flags); +{ +} + +int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, + unsigned long maxnode); +{ +} + static inline void mpol_get(struct mempolicy *pol) { } diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 1f9bb10d1a47..6ba4eb0935de 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,6 +24,7 @@ enum { MPOL_LOCAL, MPOL_PREFERRED_MANY, MPOL_WEIGHTED_INTERLEAVE, + MPOL_INVALID, /* Invalid parameter passing, come from and keep consistent with QEMU */ MPOL_MAX, /* always last member of enum */ }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f3e572e17775..b465ed5091c2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -259,7 +259,7 @@ static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. 
May also be called holding the mmap_lock for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, +int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; @@ -291,12 +291,13 @@ static int mpol_set_nodemask(struct mempolicy *pol, ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); return ret; } +EXPORT_SYMBOL_GPL(mpol_set_nodemask); /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ -static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, +struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; @@ -339,6 +340,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return policy; } +EXPORT_SYMBOL_GPL(mpol_new); /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *pol) @@ -1429,7 +1431,7 @@ static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask, } /* Copy a node mask from user space. 
*/ -static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, +int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { --maxnode; @@ -1463,6 +1465,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, return get_bitmap(nodes_addr(*nodes), nmask, maxnode); } +EXPORT_SYMBOL(get_nodes); /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, @@ -1492,7 +1495,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, } /* Basic parameter sanity check used by both mbind() and set_mempolicy() */ -static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) +inline int sanitize_mpol_flags(int *mode, unsigned short *flags) { *flags = *mode & MPOL_MODE_FLAGS; *mode &= ~MPOL_MODE_FLAGS; @@ -1509,6 +1512,7 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags) } return 0; } +EXPORT_SYMBOL_GPL(sanitize_mpol_flags); static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index f34aff971628..7570aa38e519 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -20,6 +20,7 @@ struct kvm_gmem { struct kvm *kvm; struct xarray bindings; struct list_head entry; + struct mempolicy *gmemfd_policy; }; struct kvm_gmem_hugetlb { @@ -154,21 +155,21 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } -static int kvm_gmem_get_mpol_node_nodemask(gfp_t gfp_mask, +static int kvm_gmem_get_mpol_node_nodemask(struct kvm_gmem *gmem, gfp_t gfp_mask, struct mempolicy **mpol, nodemask_t **nodemask) { /* - * TODO: mempolicy would probably have to be stored on the inode, use - * task policy for now. + * Mempolicy would probably have to be stored on the inode, if no setting in qeum + * use task policy for now. 
*/ - *mpol = get_task_policy(current); + *mpol = gmem->gmemfd_policy; /* TODO: ignore interleaving (set ilx to 0) for now. */ return policy_node_nodemask(*mpol, gfp_mask, 0, nodemask); } -static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h, +static struct folio *kvm_gmem_hugetlb_alloc_folio(struct kvm_gmem *gmem, struct hstate *h, struct hugepage_subpool *spool) { bool memcg_charge_was_prepared; @@ -195,7 +196,7 @@ static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h, if (hugepage_subpool_get_pages(spool, 1)) goto err_cancel_charge; - nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol, + nid = kvm_gmem_get_mpol_node_nodemask(gmem, htlb_alloc_mask(h), &mpol, &nodemask); /* * charge_cgroup_reservation is false because we didn't make any cgroup @@ -268,10 +269,12 @@ static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode, { struct kvm_gmem_hugetlb *hgmem; struct folio *folio; + struct kvm_gmem *gmem; int ret; hgmem = kvm_gmem_hgmem(inode); - folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool); + gmem = inode->i_mapping->i_private_data; + folio = kvm_gmem_hugetlb_alloc_folio(gmem, hgmem->h, hgmem->spool); if (IS_ERR(folio)) return folio; @@ -905,7 +908,7 @@ static struct file *kvm_gmem_inode_create_getfile(void *priv, loff_t size, return file; } -static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags, struct mempolicy *new) { struct kvm_gmem *gmem; struct file *file; @@ -927,6 +930,8 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_gmem; } + file_inode(file)->i_mapping->i_private_data = gmem; + gmem->gmemfd_policy = new; kvm_get_kvm(kvm); gmem->kvm = kvm; xa_init(&gmem->bindings); @@ -955,6 +960,40 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) { loff_t size = args->size; u64 flags = args->flags; + nodemask_t nodes; + struct mempolicy *new; + int 
err, ret; + u64 mode = args->reserved[0]; + u64 maxnode = args->reserved[1]; + const unsigned long host_nodes = (unsigned long)args->reserved[2]; + unsigned short mode_flags; + int lmode = mode; + NODEMASK_SCRATCH(scratch); + if(!scratch) + return -ENOMEM; + + if (mode == MPOL_INVALID) + goto task_policy; + else { + err = sanitize_mpol_flags(&lmode, &mode_flags); + if (err) + goto task_policy; + + err = get_nodes(&nodes, &host_nodes, maxnode); + if (err) + goto task_policy; + + new = mpol_new(mode, mode_flags, &nodes); + if (IS_ERR(new)) + goto task_policy; + else + goto numa_policy; +} + +task_policy: + new = get_task_policy(current); +numa_policy: + ret = mpol_set_nodemask(new, &nodes, scratch); if (flags & KVM_GUEST_MEMFD_HUGETLB) { /* Allow huge page size encoding in flags */ @@ -975,7 +1014,7 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) if (size <= 0) return -EINVAL; - return __kvm_gmem_create(kvm, size, flags); + return __kvm_gmem_create(kvm, size, flags, new); } int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, (END) > + > + /* TODO: ignore interleaving (set ilx to 0) for now. */ > + return policy_node_nodemask(*mpol, gfp_mask, 0, nodemask); > +} > + > +static struct folio *kvm_gmem_hugetlb_alloc_folio(struct hstate *h, > + struct hugepage_subpool *spool) > +{ > + bool memcg_charge_was_prepared; > + struct mem_cgroup *memcg; > + struct mempolicy *mpol; > + nodemask_t *nodemask; > + struct folio *folio; > + gfp_t gfp_mask; > + int ret; > + int nid; > + > + gfp_mask = htlb_alloc_mask(h); > + > + memcg = get_mem_cgroup_from_current(); > + ret = mem_cgroup_hugetlb_try_charge(memcg, > + gfp_mask | __GFP_RETRY_MAYFAIL, > + pages_per_huge_page(h)); > + if (ret == -ENOMEM) > + goto err; > + > + memcg_charge_was_prepared = ret != -EOPNOTSUPP; > + > + /* Pages are only to be taken from guest_memfd subpool and nowhere else. 
*/ > + if (hugepage_subpool_get_pages(spool, 1)) > + goto err_cancel_charge; > + > + nid = kvm_gmem_get_mpol_node_nodemask(htlb_alloc_mask(h), &mpol, > + &nodemask); > + /* > + * charge_cgroup_reservation is false because we didn't make any cgroup > + * reservations when creating the guest_memfd subpool. > + * > + * use_hstate_resv is true because we reserved from global hstate when > + * creating the guest_memfd subpool. > + */ > + folio = hugetlb_alloc_folio(h, mpol, nid, nodemask, false, true); > + mpol_cond_put(mpol); > + > + if (!folio) > + goto err_put_pages; > + > + hugetlb_set_folio_subpool(folio, spool); > + > + if (memcg_charge_was_prepared) > + mem_cgroup_commit_charge(folio, memcg); > + > +out: > + mem_cgroup_put(memcg); > + > + return folio; > + > +err_put_pages: > + hugepage_subpool_put_pages(spool, 1); > + > +err_cancel_charge: > + if (memcg_charge_was_prepared) > + mem_cgroup_cancel_charge(memcg, pages_per_huge_page(h)); > + > +err: > + folio = ERR_PTR(-ENOMEM); > + goto out; > +} > + > +static int kvm_gmem_hugetlb_filemap_add_folio(struct address_space *mapping, > + struct folio *folio, pgoff_t index, > + gfp_t gfp) > +{ > + int ret; > + > + __folio_set_locked(folio); > + ret = __filemap_add_folio(mapping, folio, index, gfp, NULL); > + if (unlikely(ret)) { > + __folio_clear_locked(folio); > + return ret; > + } > + > + /* > + * In hugetlb_add_to_page_cache(), there is a call to > + * folio_clear_hugetlb_restore_reserve(). This is handled when the pages > + * are removed from the page cache in unmap_hugepage_range() -> > + * __unmap_hugepage_range() by conditionally calling > + * folio_set_hugetlb_restore_reserve(). In kvm_gmem_hugetlb's usage of > + * hugetlb, there are no VMAs involved, and pages are never taken from > + * the surplus, so when pages are freed, the hstate reserve must be > + * restored. Hence, this function makes no call to > + * folio_clear_hugetlb_restore_reserve(). 
> + */ > + > + /* mark folio dirty so that it will not be removed from cache/inode */ > + folio_mark_dirty(folio); > + > + return 0; > +} > + > +static struct folio *kvm_gmem_hugetlb_alloc_and_cache_folio(struct inode *inode, > + pgoff_t index) > +{ > + struct kvm_gmem_hugetlb *hgmem; > + struct folio *folio; > + int ret; > + > + hgmem = kvm_gmem_hgmem(inode); > + folio = kvm_gmem_hugetlb_alloc_folio(hgmem->h, hgmem->spool); > + if (IS_ERR(folio)) > + return folio; > + > + /* TODO: Fix index here to be aligned to huge page size. */ > + ret = kvm_gmem_hugetlb_filemap_add_folio( > + inode->i_mapping, folio, index, htlb_alloc_mask(hgmem->h)); > + if (ret) { > + folio_put(folio); > + return ERR_PTR(ret); > + } > + > + spin_lock(&inode->i_lock); > + inode->i_blocks += blocks_per_huge_page(hgmem->h); > + spin_unlock(&inode->i_lock); > + > + return folio; > +} > + > +static struct folio *kvm_gmem_get_hugetlb_folio(struct inode *inode, > + pgoff_t index) > +{ > + struct address_space *mapping; > + struct folio *folio; > + struct hstate *h; > + pgoff_t hindex; > + u32 hash; > + > + h = kvm_gmem_hgmem(inode)->h; > + hindex = index >> huge_page_order(h); > + mapping = inode->i_mapping; > + > + /* To lock, we calculate the hash using the hindex and not index. */ > + hash = hugetlb_fault_mutex_hash(mapping, hindex); > + mutex_lock(&hugetlb_fault_mutex_table[hash]); > + > + /* > + * The filemap is indexed with index and not hindex. Taking lock on > + * folio to align with kvm_gmem_get_regular_folio() > + */ > + folio = filemap_lock_folio(mapping, index); > + if (!IS_ERR(folio)) > + goto out; > + > + folio = kvm_gmem_hugetlb_alloc_and_cache_folio(inode, index); > +out: > + mutex_unlock(&hugetlb_fault_mutex_table[hash]); > + > + return folio; > +} > + > /* > * Returns a locked folio on success. The caller is responsible for > * setting the up-to-date flag before the memory is mapped into the guest. 
> @@ -114,8 +299,10 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, > */ > static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) > { > - /* TODO: Support huge pages. */ > - return filemap_grab_folio(inode->i_mapping, index); > + if (is_kvm_gmem_hugetlb(inode)) > + return kvm_gmem_get_hugetlb_folio(inode, index); > + else > + return filemap_grab_folio(inode->i_mapping, index); > } > > static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, > @@ -240,6 +427,35 @@ static void kvm_gmem_hugetlb_truncate_folios_range(struct inode *inode, > spin_unlock(&inode->i_lock); > } > > +static void kvm_gmem_hugetlb_truncate_range(struct inode *inode, loff_t lstart, > + loff_t lend) > +{ > + loff_t full_hpage_start; > + loff_t full_hpage_end; > + unsigned long hsize; > + struct hstate *h; > + > + h = kvm_gmem_hgmem(inode)->h; > + hsize = huge_page_size(h); > + > + full_hpage_start = round_up(lstart, hsize); > + full_hpage_end = round_down(lend, hsize); > + > + if (lstart < full_hpage_start) { > + hugetlb_zero_partial_page(h, inode->i_mapping, lstart, > + full_hpage_start); > + } > + > + if (full_hpage_end > full_hpage_start) { > + kvm_gmem_hugetlb_truncate_folios_range(inode, full_hpage_start, > + full_hpage_end); > + } > + > + if (lend > full_hpage_end) { > + hugetlb_zero_partial_page(h, inode->i_mapping, full_hpage_end, > + lend); > + } > +} > > static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) > { > @@ -257,7 +473,12 @@ static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) > list_for_each_entry(gmem, gmem_list, entry) > kvm_gmem_invalidate_begin(gmem, start, end); > > - truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); > + if (is_kvm_gmem_hugetlb(inode)) { > + kvm_gmem_hugetlb_truncate_range(inode, offset, offset + len); > + } else { > + truncate_inode_pages_range(inode->i_mapping, offset, > + offset + len - 1); > + } > > 
list_for_each_entry(gmem, gmem_list, entry) > kvm_gmem_invalidate_end(gmem, start, end); > @@ -279,8 +500,15 @@ static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) > > filemap_invalidate_lock_shared(mapping); > > - start = offset >> PAGE_SHIFT; > - end = (offset + len) >> PAGE_SHIFT; > + if (is_kvm_gmem_hugetlb(inode)) { > + unsigned long hsize = huge_page_size(kvm_gmem_hgmem(inode)->h); > + > + start = round_down(offset, hsize) >> PAGE_SHIFT; > + end = round_down(offset + len, hsize) >> PAGE_SHIFT; > + } else { > + start = offset >> PAGE_SHIFT; > + end = (offset + len) >> PAGE_SHIFT; > + } > > r = 0; > for (index = start; index < end; ) { > @@ -408,9 +636,7 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode) > > static void kvm_gmem_evict_inode(struct inode *inode) > { > - u64 flags = (u64)inode->i_private; > - > - if (flags & KVM_GUEST_MEMFD_HUGETLB) > + if (is_kvm_gmem_hugetlb(inode)) > kvm_gmem_hugetlb_teardown(inode); > else > truncate_inode_pages_final(inode->i_mapping); > @@ -827,7 +1053,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, > > *pfn = folio_file_pfn(folio, index); > if (max_order) > - *max_order = 0; > + *max_order = folio_order(folio); > > *is_prepared = folio_test_uptodate(folio); > return folio;
On Tue, Sep 10, 2024 at 11:43:46PM +0000, Ackerley Tng wrote: > If HugeTLB is requested at guest_memfd creation time, HugeTLB pages > will be used to back guest_memfd. > > Signed-off-by: Ackerley Tng <ackerleytng@google.com> > --- > virt/kvm/guest_memfd.c | 252 ++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 239 insertions(+), 13 deletions(-) > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c > index 31e1115273e1..2e6f12e2bac8 100644 > --- a/virt/kvm/guest_memfd.c > +++ b/virt/kvm/guest_memfd.c > @@ -8,6 +8,8 @@ > #include <linux/pseudo_fs.h> > #include <linux/pagemap.h> > #include <linux/anon_inodes.h> > +#include <linux/memcontrol.h> > +#include <linux/mempolicy.h> > > #include "kvm_mm.h" > > @@ -29,6 +31,13 @@ static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode) > return inode->i_mapping->i_private_data; > } > > +static bool is_kvm_gmem_hugetlb(struct inode *inode) > +{ > + u64 flags = (u64)inode->i_private; > + > + return flags & KVM_GUEST_MEMFD_HUGETLB; > +} > + > /** > * folio_file_pfn - like folio_file_page, but return a pfn. > * @folio: The folio which contains this index. > @@ -58,6 +67,9 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo > return 0; > } > > +/** > + * Use the uptodate flag to indicate that the folio is prepared for KVM's usage. > + */ > static inline void kvm_gmem_mark_prepared(struct folio *folio) > { > folio_mark_uptodate(folio); > @@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio) > static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, > gfn_t gfn, struct folio *folio) > { > - unsigned long nr_pages, i; > pgoff_t index; > int r; > > - nr_pages = folio_nr_pages(folio); > - for (i = 0; i < nr_pages; i++) > - clear_highpage(folio_page(folio, i)); > + if (folio_test_hugetlb(folio)) { > + folio_zero_user(folio, folio->index << PAGE_SHIFT); Is (folio->index << PAGE_SHIFT) the right address hint to provide? 
I don't think we can say the folio will be mapped at this address since this value is an offset into the file. In most cases, I believe it won't be mapped anywhere since we just allocated it. Thanks, Elliot
Elliot Berman <quic_eberman@quicinc.com> writes: > On Tue, Sep 10, 2024 at 11:43:46PM +0000, Ackerley Tng wrote: >> If HugeTLB is requested at guest_memfd creation time, HugeTLB pages >> will be used to back guest_memfd. >> >> Signed-off-by: Ackerley Tng <ackerleytng@google.com> >> >> <snip> >> >> +/** >> + * Use the uptodate flag to indicate that the folio is prepared for KVM's usage. >> + */ >> static inline void kvm_gmem_mark_prepared(struct folio *folio) >> { >> folio_mark_uptodate(folio); >> @@ -72,13 +84,18 @@ static inline void kvm_gmem_mark_prepared(struct folio *folio) >> static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, >> gfn_t gfn, struct folio *folio) >> { >> - unsigned long nr_pages, i; >> pgoff_t index; >> int r; >> >> - nr_pages = folio_nr_pages(folio); >> - for (i = 0; i < nr_pages; i++) >> - clear_highpage(folio_page(folio, i)); >> + if (folio_test_hugetlb(folio)) { >> + folio_zero_user(folio, folio->index << PAGE_SHIFT); > > Is (folio->index << PAGE_SHIFT) the right address hint to provide? > I don't think we can say the folio will be mapped at this address since > this value is an offset into the file. In most cases, I believe it > won't be mapped anywhere since we just allocated it. vaddr in folio_zero_user(folio, vaddr) is eventually passed to clear_user_page(). clear_user_page() uses vaddr to clean up dcaches on some architectures, according to Documentation/core-api/cachetlb.rst. In this patch series, folio_zero_user() is used in 2 places: + kvm_gmem_prepare_folio() + kvm_gmem_fault() folio->index is valid by the time folio_zero_user() is called in kvm_gmem_prepare_folio(), because when kvm_gmem_prepare_folio() is called, the folio is already in the filemap, and folio->index is set when the folio is added to the filemap. In kvm_gmem_fault(), kvm_gmem_get_folio() also returns a folio in the filemap and so folio->index is valid by the time folio_zero_user() is called. 
Hence in both cases where folio_zero_user() is called, folio->index << PAGE_SHIFT returns the offset in the file. In hugetlb's fallocate, the offset within the file is passed in the call to folio_zero_user(), which is why the offset within the file was used here. In the next revision I will refactor this to something like kvm_gmem_prepare_folio_shared() and kvm_gmem_prepare_folio_private(). In kvm_gmem_prepare_folio_private(), folio->index << PAGE_SHIFT can still be passed as addr_hint to align with HugeTLB. When being prepared as a private folio, the folio will be mapped by KVM: addr_hint won't matter since this folio isn't going to be mapped into userspace. If the folio was previously used as a shared page, unmapping would have flushed the dcache. In kvm_gmem_prepare_folio_shared(), the folio will subsequently be mapped and vmf->real_address should be passed as addr_hint. Thanks for this question!
© 2016 - 2024 Red Hat, Inc.